def writeBureauHistoricalData(multiSourceObject, sourceDataFolderName=None, bureauSET=None, alterStillWork=None):

    """
    PART (1)-1
    歷史檔案 能源局產品 寫入bureau_energy_products_backup 表格-------------------------------------

    This session is one-time procedure!   No need to do it again if done.
    """

    if not alterStillWork:
        begin = timeCalculate()
        multiSourceObject.writeHistoricalDataIntoDB(multiSourceObject.bureauEnergyM, 
                            sourceDataFolderName=sourceDataFolderName, bureauSET=bureauSET)
        end = timeCalculate()
        print(f"歷史檔案 {sourceDataFolderName} 花費:", end-begin, "秒。")

    else:

        """
        PART (1)-2
        最新檔案 能源局產品  寫入bureau_energy_products_backup 表格-------------------------------------

        This session is alternative;We are allowed to execute programs here or at "insertLatestDataIntoDB.py  PART (2)-2"。
        What worth giving attention is whether historical data in DB at first?
        """


        begin = timeCalculate()
        multiSourceObject.bureauEnergyM.alterStillWorkToZero(alterStillWork)
        multiSourceObject.writeHistoricalDataIntoDB(multiSourceObject.bureauEnergyM, 
                            sourceDataFolderName=sourceDataFolderName, bureauSET=bureauSET, latestBoolean=1)
        end = timeCalculate()
        print(f"最新檔案 {sourceDataFolderName} 花費:", end-begin, "秒。")
Example #2
def getPageInARaw(input, _headers, objectiveFolder, objective, *args):
    begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        consecutiveUrl = input.get()
        year, month = consecutiveUrl.split("+")

        url = f"https://www.cwb.gov.tw/V8/C/C/Statistics/MonthlyData/MOD/{year}_{month}.html"
        res = requests.get(url, headers=_headers)
        timeSleepRandomly()
        res.encoding = "utf-8"

        soup = BeautifulSoup(res.text, 'html.parser')

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{year}/{month}_{year}.txt",
                'w',
                encoding='utf-8') as f:
            f.write(str(soup))
        print()
        print(f'{thisPID}  成功寫出  {month}_{year}.txt ')

        input.task_done()
        end = timeCalculate()
        print(f'{thisPID}_getPageInARaw 累計耗時:{end-begin} 秒')
        timeSleepOne()
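# The crawlers above rely on small timing helpers from the project's libs.timeWidget.
# Their real implementations are not shown in these excerpts; a minimal sketch of what
# they are assumed to wrap (time.time() for timeCalculate, time.sleep() plus random
# jitter for the timeSleep* helpers) could look like this:
import random
import time


def timeCalculate():
    """Assumed to return the current time in seconds, so end - begin is the elapsed time."""
    return time.time()


def timeSleepOne():
    """Assumed to pause for roughly one second."""
    time.sleep(1)


def timeSleepRandomly():
    """Assumed to pause for a short random interval, to look less like a bot."""
    time.sleep(random.uniform(0.5, 2.0))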
Example #3
def writeNewsTitle(multiSourceObject, sourceDataFolderName=None, historyFile=None):
    """
    PART (5)-1
    將 新聞標題 歷史資料 寫入 news_title_from_selenium 表格-------------------------------------
    This session is one-time procedure!  No need to do it again if done.
    """
    if historyFile:
        begin = timeCalculate()

        multiSourceObject.newsM.truncateTable(1, sourceDataFolderName=sourceDataFolderName)
        multiSourceObject.writeLatestDataIntoDB(multiSourceObject.newsM, 
                                        sourceDataFolderName=sourceDataFolderName)
        end = timeCalculate()
        print(f"歷史檔案 {sourceDataFolderName} 花費:", end-begin, "秒。")

    else:
        """
        PART (5)-2
        將 新聞標題 最新的資料 寫入 news_title_from_selenium 表格-------------------------------------
        This session is one-time procedure and have to be done repeatedly if having the latest files.
        """
        begin = timeCalculate()

        multiSourceObject.writeLatestDataIntoDB(multiSourceObject.newsM, 
                                        sourceDataFolderName=sourceDataFolderName, latestBoolean=1)
        end = timeCalculate()
        print(f"最新檔案 {sourceDataFolderName} 花費:", end-begin, "秒。")
Example #4
def writeBureauData(multiSourceObject, sourceDataFolderName=None, bureauSET=None, alterStillWork=None):

    """
    PART (2)-1
    寫入 能源局產品 到 bureau_energy_products 表格-------------------------------------
    This session is one-time procedure and have to be done repeatedly if having the latest files.
    """
    if not alterStillWork: # insert into the latest table
        begin = timeCalculate()
        multiSourceObject.bureauEnergyM.truncateTable(1, sourceDataFolderName=sourceDataFolderName)
        multiSourceObject.writeLatestDataIntoDB(multiSourceObject.bureauEnergyM, 
                        sourceDataFolderName=sourceDataFolderName, bureauSET=bureauSET)
        end = timeCalculate()
        print(f"最新檔案 {sourceDataFolderName} 花費:", end-begin, "秒。")
    else: # insert into the backup table
        """
        PART (2)-2
        將最新的 能源局產品 檔案寫入bureau_energy_products_backup 表格
        This session is alternative;Please refer to 'inserHistoricalDataIntoDB.py  PART (1)-2".
        """
        begin = timeCalculate()
        multiSourceObject.bureauEnergyM.alterStillWorkToZero(alterStillWork)
        multiSourceObject.writeHistoricalDataIntoDB(multiSourceObject.bureauEnergyM, 
                            sourceDataFolderName=sourceDataFolderName, bureauSET=bureauSET, latestBoolean=1)
        end = timeCalculate()
        print(f"最新檔案 {sourceDataFolderName} 寫入backup表格 花費:", end-begin, "秒。")
Example #5
def writeNewsContent(multiSourceObject, sourceDataFolderName=None, historyFile=None):
    """
    PART (6)-1
    將 篩選的新聞 歷史資料 寫入 selected_news_with_tfidf 表格-------------------------------------
    This session is one-time procedure!  No need to do it again if done.
    """
    if historyFile:

        begin = timeCalculate()

        multiSourceObject.newsWithContentM.truncateTable(1, sourceDataFolderName=sourceDataFolderName)
        multiSourceObject.writeLatestDataIntoDB(multiSourceObject.newsWithContentM, 
                                        sourceDataFolderName=sourceDataFolderName)

        end = timeCalculate()
        print(f"歷史檔案 {sourceDataFolderName} 花費:", end-begin, "秒。")

    else:
        """
        PART (6)-2
        將 篩選的新聞 最新的資料 寫入 selected_news_with_tfidf 表格-------------------------------------
        This session is one-time procedure and have to be done repeatedly if having the latest files.
        """

        begin = timeCalculate()

        multiSourceObject.writeLatestDataIntoDB(multiSourceObject.newsWithContentM,
                                    sourceDataFolderName=sourceDataFolderName, latestBoolean=1)

        end = timeCalculate()
        print(f"最新檔案 {sourceDataFolderName} 花費:", end-begin, "秒。")
Example #6
def writeReferenceData(tableClassBase, engine):
    """
    PART (1)
    寫入 referenceFiles-------------------------------------
    This session is one-time procedure! No need to do it again if done.
    """
    begin = timeCalculate()
    referenceFiles().writeReferenceDataIntoDB(tableClassBase, engine)
    end = timeCalculate()

    print("referenceFiles完成:", end-begin, "秒。")
Example #7
def writeWeatherData(multiSourceObject, sourceDataFolderName=None):
    """
    PART (4)
    將氣象歷史資料 寫入  weather_records_by_months 表格-------------------------------------
    This session is one-time procedure  and have to be done repeatedly if having the latest files.
    """
    begin = timeCalculate()

    multiSourceObject.weatherM.truncateTable(1)
    multiSourceObject.writeLatestDataIntoDB(multiSourceObject.weatherM, 
                                    sourceDataFolderName=sourceDataFolderName)
    end = timeCalculate()
    print(f"最新檔案 {sourceDataFolderName} 花費:", end-begin, "秒。")
Example #8
def getPageInARow(input, headers, objectiveFolder, objective, *args):
    begin = timeCalculate()  # needed for the cumulative elapsed-time print below
    thisPID = os.getpid()
    while True:
        # print(thisPID,"===========================================")
        consecutiveUrl = input.get()
        searchword, correctUrl, txtFileRoute = consecutiveUrl.split("+")

        fileName = txtFileRoute.split("/")[-1]
        page = fileName.split("_")[0]
        totalPage = fileName.split("_")[1]

        # print('getPageInARow is in new process %s, %s ' % (getPageInARow_proc, os.getpid()))
        # print('------接下來要處理 ' + searchword + '第' ,page, '頁---------共', totalPage, '頁')

        timeSleepOne()
        timeSleepRandomly()
        res = requests.get(correctUrl, headers=headers)
        res.encoding = 'utf-8'

        timeSleepRandomly()

        soup = BeautifulSoup(res.text, 'html.parser')

        with open(txtFileRoute, 'w', encoding='utf-8') as f:
            f.write(str(soup))
        print(f"成功寫出  {searchword}  第 {page} 頁, 共 {totalPage} 頁。")
        end = timeCalculate()
        print('getPageInARow 累計耗時:{0} 秒'.format(end - begin))
        input.task_done()  # notify the main process that this input item is done
        timeSleepOne()  # pause a few seconds to mimic a real user
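# A hedged wiring sketch (not from the repo) showing how a getPageInARow worker is
# typically driven: a JoinableQueue is filled with "+"-joined work items, a few daemon
# processes consume it, and queue.join() blocks until every item has been acknowledged
# with input.task_done(). The headers, item and process count are placeholders.
import multiprocessing as mp

if __name__ == "__main__":
    headers = {"User-Agent": "Mozilla/5.0"}          # placeholder headers
    url_queue = mp.JoinableQueue()

    # producer side: one illustrative "searchword+url+txtFileRoute" item
    url_queue.put("除濕機+https://example.com/overview?curPage=1+/tmp/1_3_除濕機.txt")

    workers = [
        mp.Process(target=getPageInARow,
                   args=(url_queue, headers, "rawData", "momo"),
                   daemon=True)
        for _ in range(4)
    ]
    for proc in workers:
        proc.start()

    url_queue.join()  # returns once every queued item has been task_done()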
Example #9
def distributeMonthAvailable(input, output, _weatherRecordAvailable,
                             objectiveFolder, objective, *args):
    begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        year = input.get()
        monthsAvailable = _weatherRecordAvailable[year]

        eraseRawData(objectiveFolder, objective, year)
        mkdirForRawData(objectiveFolder, objective, year)

        for month in monthsAvailable:
            consecutiveData = year + "+" + month
            output.put(consecutiveData)
            print(
                f'這裡是distributeMonthAvailable,準備送給  getPageInARow  處理: {year}年_{month}月 '
            )
        input.task_done()
        end = timeCalculate()
        print(f'{thisPID}_distributeMonthAvailable 累計耗時:{end-begin} 秒')
        timeSleepOne()
def overviewUriDistributor(input, output, keywordUrlPair, headers, dirRoute,
                           objectiveFolder, objective, *args):
    begin = timeCalculate()  # needed for the cumulative elapsed-time print below
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        url = keywordUrlPair[searchword]
        totalPage = getPageFirst(url + "1", headers)

        print('overviewUriDistributor is in new process %s, %s ' %
              (overviewUriDistributor_proc, os.getpid()))
        print('------接下來要發送 ' + searchword + ' 的overviewUri---------', '共',
              totalPage, '頁')

        # Don't postpone the folder checks until crawl time; doing that would keep creating and deleting folders.
        eraseRawData(objectiveFolder,
                     objective,
                     searchword,
                     keyword="overview")
        mkdirForRawData(objectiveFolder,
                        objective,
                        searchword,
                        keyword="overview")

        for page in range(1, int(totalPage) + 1):
            correctUrl = url + str(page)

            readyTxtFileRoute = dirRoute + f"{searchword}/overview/{page}_{totalPage}_{searchword}.txt"
            #TypeError: must be str, not tuple
            consecutiveData = searchword + "+" + correctUrl + "+" + readyTxtFileRoute

            output.put(consecutiveData)
        print(
            f'這裡是 overviewUriDistributor_{thisPID},準備送給  getPageInARow  處理 {totalPage} 頁的 overviewUri'
        )
        print()

        end = timeCalculate()
        print('overviewUriDistributor 累計耗時:{0} 秒'.format(end - begin))
        input.task_done()  # notify the main process that this input item is done
        timeSleepOne()  # pause a few seconds to mimic a real user
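# The processes above exchange work items as a single "+"-joined string that the
# consumer splits back apart. A tiny self-contained illustration of that convention
# (packConsecutiveData is a hypothetical helper; note none of the fields may contain "+"):
def packConsecutiveData(*fields):
    return "+".join(str(field) for field in fields)


consecutiveData = packConsecutiveData("除濕機",
                                      "https://example.com/overview?curPage=3",
                                      "/tmp/3_12_除濕機.txt")
searchword, correctUrl, txtFileRoute = consecutiveData.split("+")
print(searchword, correctUrl, txtFileRoute)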
Example #11
def writeEcommerceData(multiSourceObject, sourceDataFolderName_1=None, sourceDataFolderName_2=None, alterStillWork=None):

    """
    PART (3)-1
    寫入 電商商品 到 ecommerce_products 表格-------------------------------------
    This session is one-time procedure and have to be done repeatedly if having the latest files.
    """
    if not alterStillWork:  # insert into the latest table
        begin = timeCalculate()
        multiSourceObject.ecommerceM.truncateTable(1, sourceDataFolderName=sourceDataFolderName_1)
        multiSourceObject.writeLatestDataIntoDB(multiSourceObject.ecommerceM,
                        sourceDataFolderName=sourceDataFolderName_1)
        end = timeCalculate()
        print(f"最新檔案 {sourceDataFolderName_1} 花費:", end-begin, "秒。")

        begin = timeCalculate()
        multiSourceObject.writeLatestDataIntoDB(multiSourceObject.ecommerceM, 
                        sourceDataFolderName=sourceDataFolderName_2)
        end = timeCalculate()
        print(f"最新檔案 {sourceDataFolderName_2} 花費:", end-begin, "秒。")
    
    else:  # insert into the backup table

        """
        PART (3)-2
        將最新的 電商商品 檔案寫入 ecommerce_products_backup 表格 ----------連同價格異動的情況送進 ecommerce_products_price_records資料表--------------------------
        This session is alternative;Please refer to 'inserHistoricalDataIntoDB.py PART (2)-2".
        """

        begin = timeCalculate()

        multiSourceObject.ecommerceM.alterStillWorkToZero(alterStillWork)

        # (1) In-memory approach; see sqlDMLAndsqlAlchemyORM.py
        # multiSourceObject.writeHistoricalDataIntoDB(multiSourceObject.ecommerceM, 
        #                                 sourceDataFolderName=sourceDataFolderName_1, latestBoolean=1)
        # end = timeCalculate()
        # print(f"最新檔案 {sourceDataFolderName_1}, {sourceDataFolderName_2} 花費:", end-begin, "秒。")


        # (2) Local-file approach; see sqlDMLAndsqlAlchemyORM.py
        multiSourceObject.writeHistoricalDataIntoDB(multiSourceObject.ecommerceM, 
                                        sourceDataFolderName=sourceDataFolderName_1, latestBoolean=1)

        multiSourceObject.writeHistoricalDataIntoDB(multiSourceObject.ecommerceM, 
                                        sourceDataFolderName=sourceDataFolderName_2, latestBoolean=1)
        end = timeCalculate()
        print(f"最新檔案 {sourceDataFolderName_1}, {sourceDataFolderName_2} 花費:", end-begin, "秒。")
def writeEcommerceHistoricalData(multiSourceObject, sourceDataFolderName_1=None, sourceDataFolderName_2=None, alterStillWork=None):
    
    
    """
    PART (2)-1
    歷史檔案 電商產品 寫入 ecommerce_products_backup 表格 
    ----------連同價格異動的情況送進 ecommerce_products_price_records資料表--------------------------
    This session is one-time procedure!   No need to do it again if done.
    """
    if not alterStillWork:
        begin = timeCalculate()
        multiSourceObject.writeHistoricalDataIntoDB(multiSourceObject.ecommerceM, 
                                        sourceDataFolderName=sourceDataFolderName_1)
        end = timeCalculate()
        print(f"歷史檔案 {sourceDataFolderName_1} 花費:", end-begin, "秒。")


        begin = timeCalculate()
        multiSourceObject.writeHistoricalDataIntoDB(multiSourceObject.ecommerceM, 
                                        sourceDataFolderName=sourceDataFolderName_2)
        end = timeCalculate()
        print(f"歷史檔案 {sourceDataFolderName_2} 花費:", end-begin, "秒。")

    else:
        """
        PART (2)-2
        最新檔案 電商產品 寫入 ecommerce_products_backup 表格 
        -----------連同價格異動的情況送進 ecommerce_products_price_records資料表--------------------------
        This session is alternative;We are allowed to execute programs here or at "insertLatestDataIntoDB.py  PART (3)-2"。
        What worth giving attention is whether historical data in DB at first?
        """
        begin = timeCalculate()

        multiSourceObject.ecommerceM.alterStillWorkToZero(alterStillWork)
        multiSourceObject.writeHistoricalDataIntoDB(multiSourceObject.ecommerceM, 
                                        sourceDataFolderName=sourceDataFolderName_1, latestBoolean=1)
        end = timeCalculate()
        print(f"最新檔案 {sourceDataFolderName_1}, {sourceDataFolderName_2} 花費:", end-begin, "秒。")
def dataMunging(input, output, dirRoute, objectiveFolder, objective, domainUrl, *args):
    begin = timeCalculate()  # needed for the cumulative elapsed-time print below
    thisPID = os.getpid()
    energyLabelUrl = "https://ranking.energylabel.org.tw/_Upload/applyMain/applyp/"
    bureauReplace = bureauEnergyReplace()
    while True:
        print(thisPID,"===========================================")
        searchword = input.get() 
        dirNameAccepted = dirRoute + f"{searchword}/overview/"
        dirNameWriteOut = dirRoute + f"{searchword}/"

        # Don't postpone the folder checks until crawl time; doing that would keep creating and deleting folders.
        eraseRawData(objectiveFolder, objective, searchword, keyword="jsonIntegration")
        mkdirForRawData(objectiveFolder, objective, searchword, keyword="jsonIntegration")

        print('dataMunging is in new process %s, %s ' % (dataMunging_proc, thisPID))
        print()
        print('------接下來要處理資料夾路徑「 ' + dirNameAccepted + '」---------')
        print()
        
        if not os.listdir(dirNameAccepted):
            print(f"============={objective} {searchword} 資料夾沒有東西,此進程準備結束。=============")
            input.task_done()
            timeSleepOne()
            print("========================break========================")
            break

        bureauEnergyDict = {}
        productArray= [] 
        
        for file in initialFileZeroUnderscoreInt(dirNameAccepted):
            # print(" start " + file + " ! ")
                
            with open(dirNameAccepted + file)as f:
                inn = f.read()
            
            # 處理soup=""的情況
            if not inn:
              continue
            
            textSoup = BeautifulSoup(inn,'html.parser')

            a = 0
            b = 7

            for i in range(10): # 10 items per page, 7 elements per item
                oneset = textSoup.find_all('div',{'class':'col-md-12 column'})[-1].find_all('td',{'align':'left'})[a:b]
                if oneset != []:
                    
                    detailUrl =  domainUrl + oneset[2].a.attrs.get('href')
                    
                    parseUrl = urlparse(detailUrl)
                    qsDict = parse_qs(parseUrl.query)
                    p1 = qsDict['id'].pop() # the "id" query parameter is p1
                    p0 = qsDict['p0'].pop()
                    
                    productDict = {}
                    productDict['Id'] = p1 #oneset[2].a.attrs.get('href').split('id=')[1]
                    #  The files contain dirty values, e.g. refrigerator "product_model": "B23KV-81RE\n", "IB 7030 F TW"; air conditioner "product_model": "PAX-K500CLD ",
                    productDict['product_model'] = bureauReplace.productModel(oneset[0].text)
                    productDict['brand_name'] = oneset[1].text
                    productDict['login_number'] = oneset[2].text
                    productDict['detailUri'] = detailUrl
                    productDict['labeling_company'] = oneset[3].text
                    productDict['efficiency_rating'] = oneset[4].text
                    productDict['from_date_of_expiration'] = bureauReplace.date(oneset[5].text)
                    
                    # We can assemble the outerUri ourselves:
                    # https://ranking.energylabel.org.tw/product/Approval/file_list.aspx?p1=20901&p0=82409
                    productDict['energy_efficiency_label_outerUri'] = f"{domainUrl}file_list.aspx?p1={p1}&p0={p0}"
                    
                    # The innerUri we actually want:
                    # https://ranking.energylabel.org.tw/_Upload/applyMain/applyp/20901/SB_photo1/EF2R-13DEX1.jpg
                    # productDict['energy_efficiency_label_innerUri'] = ... needs extra logic, so it is handled in "bureauEnergyMunging.py" instead, to avoid slowing the crawler down.


                    productArray.append(productDict)

                    a += 7
                    b += 7
                    # print('done ' + file + ' 的第' + str(i+1) + '份。')
                else:
                    print('在 ' + file + ' 的第' + str(i+1) + '處,發現空值!')
                    break
            
        bureauEnergyDict['product'] = productArray
        bureauEnergyDict['keyword'] = searchword
        timeStamp = timeStampGenerator()
        bureauEnergyDict["dateTime"] = timeStamp

        totalNums = len(bureauEnergyDict['product'])
        
        with open(dirNameWriteOut + f"jsonIntegration/{objective}_overview_{timeStamp}_{totalNums}_{searchword}.json","w",encoding="utf-8")as f:
            json.dump(bureauEnergyDict, f, indent=2, ensure_ascii=False)
        
        print(f'這裡是 dataMunging ,處理{searchword}完成: ' + dirNameWriteOut + "jsonIntegration/")


        # ========= If you only want to clean the overview html, this block can be commented out. ==========
        # Don't postpone the folder checks until crawl time; doing that would keep creating and deleting folders.
        eraseRawData(objectiveFolder, objective, searchword, keyword="detail")
        mkdirForRawData(objectiveFolder, objective, searchword, keyword="detail")
        
        productIndex = 1
        for file in bureauEnergyDict['product']:
            detailUri = file['detailUri']
            readyTxtFileRoute = dirNameWriteOut + f"detail/{productIndex}_{totalNums}_{searchword}.txt"
            
            #TypeError: must be str, not tuple
            consecutiveData = searchword + "+" + detailUri + "+" + readyTxtFileRoute

            output.put(consecutiveData)
            # print('這裡是 dataMunging,準備送給  detailPageInARow  處理: ' + consecutiveData)
            # print()            
            productIndex += 1
        # ========= ================================



        end = timeCalculate()
        print('dataMunging 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()
        timeSleepOne() # pause a few seconds to mimic a real user
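# The p1/p0 extraction above can be tried on its own. This standalone snippet uses the
# sample url quoted in the comments; urlparse and parse_qs are standard-library calls,
# so it runs without the rest of the crawler.
from urllib.parse import parse_qs, urlparse

detailUrl = "https://ranking.energylabel.org.tw/product/Approval/file_list.aspx?p1=20901&p0=82409"
qsDict = parse_qs(urlparse(detailUrl).query)
p1 = qsDict['p1'].pop()
p0 = qsDict['p0'].pop()
print(p1, p0)  # -> 20901 82409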
Example #14
def getPageInARow(input, output, keywordUrlPair, objectiveFolder, objective):
    begin = timeCalculate()  # needed for the cumulative elapsed-time print below
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        print('getPageInARow is in new process %s, %s ' %
              (getPageInARow_proc, thisPID))
        print()
        eraseRawData(objectiveFolder, objective, searchword)
        mkdirForRawData(objectiveFolder, objective, searchword)

        url = keywordUrlPair[searchword]

        # Creating the browser inside the while True loop avoids the situation where the "same browser" keeps visiting pages and gets rejected.
        for i in range(3):
            try:
                timeSleepOne()
                timeSleepRandomly()

                browser = buildSplinterBrowserHeadless('chrome')

                timeSleepRandomly()

                browser.visit(url)

                browserWaitTime(browser)
                timeSleepTwo()

                tempHtml = browser.html

                timeSleepRandomly()
                soup = BeautifulSoup(tempHtml, 'html.parser')
                print(
                    f"讀取{searchword}第 1 頁>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>成功!"
                )
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARow_proc}  讀取{searchword}第 1 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                )
                timeSleepFour()
                timeSleepRandomly()
                soup = ""
            # else:
            #     print(f"讀取{searchword}第 1 頁>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>成功!")

        try:
            totalPage = interDiv(searchNums(soup.select_one('.totalTxt').text),
                                 30)
        except AttributeError as e:
            print("getPageInARow 出錯", e)
            # force the program to stop
            raise

        print('------接下來要處理 ' + searchword + ' 的頁數---------', totalPage, '頁')
        print()

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/1_{totalPage}_{searchword}.txt",
                'w',
                encoding='utf-8') as f:
            f.write(str(soup))
        print()
        print(f'成功寫出  {searchword}  第 1 頁')

        i_browser = 1
        try:
            browser.quit()
            print(
                f"成功關閉 browser{getPageInARow_proc}++++++++++++++++++++++++++++++"
            )
        except:
            print(
                f"放棄 {thisPID}__{getPageInARow_proc} 的 第{i_browser}個browser。")
            i_browser += 1
            print(
                f"kill {thisPID}__{getPageInARow_proc} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
            )
            os.kill(thisPID, signal.SIGKILL)

        # Rest a bit longer so the first page of every searchword gets read.
        timeSleepEight()
        timeSleepEight()

        for num in range(2, totalPage + 1):
            strNum = str(num)
            consecutiveData = searchword + "+" + strNum + "+" + str(
                totalPage) + "+" + re.sub(r"curPage=1", f"curPage={strNum}",
                                          url)
            output.put(consecutiveData)
            # print(f'這裡是getPageInARow,準備送給  getPageInARowAdvanced  處理:  {searchword} 的 第 {strNum} 頁,總共{totalPage}')
            print()
        input.task_done()  # notify the main process that this input item is done
        end = timeCalculate()
        print(f'{thisPID}__getPageInARow 累計耗時:{end-begin} 秒')
Example #15
def dataMunging(input, dirRoute, objectiveFolderClean, objective):
    begin = timeCalculate()  # needed for the cumulative elapsed-time print below
    thisPID = os.getpid()
    bureauMunging = bureauEnergyMunging()
    while True:
        print(thisPID,"===========================================")
        searchword = input.get()

        dirNameCheck = dirRoute + f"{searchword}/"
        directory = dirRoute + f"{searchword}/detail/"
        dirNameWriteOut = dirRoute + f"{searchword}/jsonIntegration/"

        print('dataMunging is in new process %s, %s ' % (dataMunging_proc, thisPID))
        print()
        print('------接下來要處理資料夾路徑「 ' + dirNameWriteOut  + '」---------')
        print()


        mkdirForCleanData(objectiveFolderClean, objective)

        if not os.listdir(dirNameCheck):
            print(f"============={objective} {searchword} 資料夾沒有東西,此進程準備結束。=============")
            input.task_done()
            timeSleepOne()
            print("========================break========================")
            break

        # This block has been replaced by the simplified call below, so once the data cleaning is verified it can be deleted.
        # if searchword == "除濕機":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailDehumidification(searchword, directory)
        # elif searchword == "無風管空氣調節機":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailAirConditioner(searchword, directory)
        # elif searchword == "電冰箱":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailRefrigerator(searchword, directory)
        # elif searchword == "電熱水瓶":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailElectricWarmer(searchword, directory)
        # elif searchword == "溫熱型開飲機":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailWarmDrinkMachine(searchword, directory)
        # elif searchword == "溫熱型飲水機":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailWarmDispenser(searchword, directory)
        # elif searchword == "冰溫熱型開飲機":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailColdWarmDrinkMachine(searchword, directory)
        # elif searchword == "冰溫熱型飲水機":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailColdWarmDispenser(searchword, directory)
        # elif searchword == "貯備型電熱水器":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailStorageWaterHeaters(searchword, directory)
        # elif searchword == "瓦斯熱水器":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailGasWaterHeaters(searchword, directory)
        # elif searchword == "瓦斯爐":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailGasStove(searchword, directory)
        # elif searchword == "安定器內藏式螢光燈泡":
        #     bureauEnergyDetail, totalNums = bureauMunging.detailCompactFluorescentLamp(searchword, directory)

        # '無風管空氣調節機', '除濕機', '電冰箱', '電熱水瓶', '溫熱型開飲機',
        # '溫熱型飲水機', '冰溫熱型開飲機', '冰溫熱型飲水機', '貯備型電熱水器' , '瓦斯熱水器', '瓦斯爐', '安定器內藏式螢光燈泡'
        bureauEnergyDetail, totalNums = bureauMunging.detailMungingEntry(searchword, directory)

        with open(dirNameWriteOut + f"{objective}_detail_{timeStampGenerator()}_{totalNums}_{searchword}.json",'w',encoding='utf-8')as f:
            json.dump(bureauEnergyDetail, f, indent=2, ensure_ascii=False)

        # Find the overviewJsonFile and start merging it with the detailJsonFile:
        overviewJsonFile = [overviewFile for overviewFile in os.listdir(dirNameWriteOut) if "bureauEnergy_overview" in overviewFile].pop()
        with open(dirNameWriteOut + overviewJsonFile)as f:
            bureauEnergyOverview = json.load(f)

        modelPool = [comparedValue['product_model'] for comparedValue in bureauEnergyDetail['productDetail']]
        modelPoolDict = { v: k  for k, v in enumerate(modelPool)}


        # Open the overview JSON and add fields to every product.
        for jsonObject in bureauEnergyOverview['product']:
            index, test_report_of_energy_efficiency, benchmark, annual, labelUri = zipJsonObject(modelPoolDict, jsonObject['product_model'], bureauEnergyDetail)
            
            # print('正在處理索引值: '+str(index))
            jsonObject['test_report_of_energy_efficiency'] = test_report_of_energy_efficiency
            jsonObject['efficiency_benchmark'] = benchmark
            jsonObject['annual_power_consumption_degrees_dive_year'] = annual
            jsonObject['energy_efficiency_label_innerUri'] = labelUri
            # print('done '+str(index))

        # Refresh the dateTime of the JSON file that gained the new fields.
        timeStamp = timeStampGenerator()
        bureauEnergyOverview["dateTime"] = timeStamp
        
        with open(f"{_BASE_PATH}/dataMunging/{objectiveFolderClean}/{objective}/{objective}_{timeStamp}_{totalNums}_{searchword}.json",'w',encoding='utf-8')as f:
            json.dump(bureauEnergyOverview, f, indent=2, ensure_ascii=False)

        statistic.append(totalNums)

        print(f"這裡是dataMunging_{thisPID},準備完成工作。 ")
        print()
        end = timeCalculate()
        print('dataMunging 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()  # notify the main process that this input item is done
        timeSleepOne() # pause a few seconds to mimic a real user
def detailPageInARow(input,  headers, objectiveFolder, objective, *args):
    """
    As many as 28,000 detail urls we are supposed to crawl would inevitalby leave some processes to fail to get the correct responses.
    As such, we should extend more time while crawling , or establish exception handler in porgrams.
    
    """
    # begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        # print(thisPID,"===========================================")
        
        consecutiveUrl = input.get()
        searchword, url, txtFileRoute = consecutiveUrl.split("+")
        
        # print('detailPageInARow is in new process %s, %s ' % (detailPageInARow_proc, thisPID))
        # print()

        for i in range(4):
            if i <= 2:
                try:
                    timeSleepTwo()
                    res = requests.get(url, headers=headers)
                    res.encoding = 'utf-8'
                    timeSleepRandomly()
                    soup = BeautifulSoup(res.text, 'html.parser')
                    break
                except requests.exceptions.ConnectionError as e:
                    print(url, "發生問題。", e)
                    print()
                    timeSleepRandomly()
                    timeSleepTwo()
                    timeSleepTwo()
                    soup = ""
            else:
                try:
                    timeSleepEight()
                    res = requests.get(url, headers=headers)
                    res.encoding = 'utf-8'
                    timeSleepRandomly()
                    soup = BeautifulSoup(res.text, 'html.parser')
                    break
                except requests.exceptions.ConnectionError as e:
                    print(txtFileRoute, "發生問題。", e)
                    print()
                    soup = ""

        # If the second check in judgeSoup is triggered, force soup to an empty string.
        if judgeSoup(soup, searchword, url, txtFileRoute) == "check":
            soup = ""
        


        with open(txtFileRoute, 'w', encoding='utf-8')as f:
            f.write(str(soup))
        
        fileName = txtFileRoute.split("/")[-1]
        productIndex = fileName.split("_")[0]
        productNums = fileName.split("_")[1]
        print(f"{thisPID}__成功寫出  {searchword}  detail頁, 第 {productIndex} 項, 共 {productNums} 項。")
            
        timeSleepRandomly()

        # print('這裡是 detailPageInARow 完成: ' + fileName + " 的爬取。")
        end = timeCalculate()
        # print('detailPageInARow 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()
def getPageInARow(input, headers, objectiveFolder, objective, *args):
    thisPID = os.getpid()
    while True:
        # print(thisPID,"===========================================")
        consecutiveUrl = input.get()
        searchword, correctUrl, txtFileRoute = consecutiveUrl.split("+")

        fileName = txtFileRoute.split("/")[-1]
        page = fileName.split("_")[0]
        totalPage = fileName.split("_")[1]

        # print('getPageInARow is in new process %s, %s ' % (getPageInARow_proc, os.getpid()))
        # print('------接下來要處理 ' + searchword + '第' ,page, '頁---------共', totalPage, '頁')

        for i in range(4):
            if i <= 2:
                try:
                    timeSleepRandomly()
                    res = requests.get(correctUrl, headers=headers)
                    res.encoding = 'utf-8'
                    timeSleepRandomly()
                    timeSleepOne()
                    soup = BeautifulSoup(res.text, 'html.parser')
                    break
                except requests.exceptions.ConnectionError as e:
                    print(fileName, "發生問題。", i, e)
                    print()
                    timeSleepRandomly()
                    timeSleepTwo()
                    soup = ""
            else:
                try:
                    timeSleepEight()
                    timeSleepRandomly()
                    res = requests.get(correctUrl, headers=headers)
                    res.encoding = 'utf-8'
                    timeSleepRandomly()
                    soup = BeautifulSoup(res.text, 'html.parser')
                    break
                except requests.exceptions.ConnectionError as e:
                    print(fileName, "發生問題。", i, e)
                    print()
                    soup = ""

        # If the second check in judgeSoup is triggered, force soup to an empty string.
        if judgeSoup(soup, searchword, correctUrl, txtFileRoute) == "check":
            soup = ""

        # previous version:
        # timeSleepOne()
        # timeSleepRandomly()
        # res = requests.get(correctUrl, headers=headers)
        # res.encoding = 'utf-8'
        # timeSleepRandomly()
        # soup  = BeautifulSoup(res.text,'html.parser')

        with open(txtFileRoute, 'w', encoding='utf-8') as f:
            f.write(str(soup))
        # print(f"成功寫出  {searchword}  第 {page} 頁, 共 {totalPage} 頁。")
        end = timeCalculate()
        # print('getPageInARow 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()  # notify the main process that this input item is done
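# The retry loops in detailPageInARow and getPageInARow above repeat the same
# request / except / back-off pattern. A hedged refactoring sketch (not the repo's
# code) that captures the idea with plain time.sleep() instead of the timeSleep* helpers:
import time

import requests
from bs4 import BeautifulSoup


def fetchSoupWithRetry(url, headers, shortTries=3, longTries=1):
    """Return a BeautifulSoup of the page, or "" if every attempt fails."""
    for attempt in range(shortTries + longTries):
        # back off harder once the short attempts are exhausted
        time.sleep(2 if attempt < shortTries else 8)
        try:
            res = requests.get(url, headers=headers)
            res.encoding = 'utf-8'
            return BeautifulSoup(res.text, 'html.parser')
        except requests.exceptions.ConnectionError as e:
            print(url, "發生問題。", attempt, e)
    return ""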
Example #18
def getPageInARowAdvanced(input, folderWorker, momoMallBrowser):
    """
    開始對POST網址進行splinter的點擊
    """
    thisPID = os.getpid()
    while True:
        # print(thisPID,"===========================================")
        consecutiveData = input.get()
        searchword, currentPage, totalPage = consecutiveData.split('+')

        url = momoMallBrowser.keywordResourcePair._momoMallKeywordUrlPair[
            searchword]

        # Creating the browser inside the while True loop avoids the situation where the "same browser" keeps visiting pages and gets rejected.
        for i in range(4):
            try:
                timeSleepFour()

                browser = momoMallBrowser.intersectionForCrawl(
                    folderWorker.objective)

                timeSleepRandomly()

                browserSetWindowSize(browser, horizon=1920, vertical=1080)
                timeSleepOne()

                browser.visit(url)

                browserWaitTime(browser)
                timeSleepTwo()

                # click the "accuracy" sort option; the page jumps back to page 1
                try:
                    buyingTendency = momoMallBrowser.browserClickSearchType(
                        browser, 1)
                    browserWaitTime(browser)
                    timeSleepTwo()
                except AttributeError as e:
                    print(
                        f"{thisPID}__{getPageInARowAdvanced_proc}  {searchword} 在第{currentPage}頁點擊準確度有問題。",
                        e)
                    print(
                        f"{thisPID}__{getPageInARowAdvanced_proc}  重建browser物件,進行再處理 {i} 次!"
                    )
                    browserQuit(browser, thisPID, getPageInARowAdvanced_proc)
                    timeSleepFour()
                    soup = ""
                    continue

                # click through to the correct page number
                momoMallBrowser.browserClickPageNumber(browser, currentPage,
                                                       totalPage, searchword)

                tempHtml = browser.html
                timeSleepRandomly()

                # simulate human behaviour
                momoMallBrowser.humanSimulate(browser)

                soup = BeautifulSoup(tempHtml, 'lxml')
                # print(f"讀取{searchword}第 {currentPage} 頁,成功!")
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARowAdvanced_proc} 讀取 {searchword} 第 {currentPage} 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARowAdvanced_proc} 重建browser物件,進行再處理 {i} 次!"
                )
                browserQuit(browser, thisPID, getPageInARowAdvanced_proc)
                timeSleepFour()
                timeSleepRandomly()
                soup = ""
            # else:
            #     print(f"讀取{searchword}第 {page} 頁,成功!")

        if not soup:
            errorMessage = f"{url}__{currentPage}__" + "\n"
            folderWorker.writeOutFile(
                f"{folderWorker._BASE_PATH}/dataMunging/{folderWorker.objectiveFolder}/{folderWorker.objective}/badRequest",
                f"badRequest_{searchword}.txt",
                errorMessage,
                writeOutType="a")

        folderWorker.writeOutFile(
            f"{folderWorker._BASE_PATH}/dataMunging/{folderWorker.objectiveFolder}/{folderWorker.objective}/{searchword}",
            f"{currentPage}_{totalPage}_{searchword}.txt", soup)

        # print(f'{thisPID}  成功寫出  {searchword}  第{currentPage}頁,總共{totalPage} 頁。')

        browserQuit(browser, thisPID, getPageInARowAdvanced_proc)

        input.task_done()  # notify the main process that this input item is done
        end = timeCalculate()
Example #19
def getPageInARow(input, url, firstPage, topTabList, elementUrl,
                  objectiveFolder, objective, *args):
    begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()

        mkdirForRawData(objectiveFolder,
                        objective,
                        "google",
                        keyword=searchword)
        browser = buildSplinterBrowserHeadless("chrome")

        browser.visit(url)
        browserWaitTime(browser)

        searchwordKeyInAndEnter(browser, searchword)
        browser.driver.set_window_size(1024, 768)

        forSureNws = findOutNws(browser, topTabList)
        keyNews = [key for key in forSureNws if forSureNws[key] == '新聞'].pop()
        # exclude the news tab from the human-like mouse_over
        topTabList.remove(int(keyNews))

        print(f"點擊 topTabList {keyNews} 去到 新聞頁")
        # click the news tab
        browser.find_by_xpath(
            f'//*[@id="hdtb-msb-vis"]/div[{keyNews}]/a').click()
        timeSleepRandomly()

        newsDict = {}
        newsDictInner = {}
        while True:
            print(f"進行 {searchword} 第", firstPage, "頁")
            elementUrlExtract(browser, firstPage, topTabList, elementUrl,
                              newsDictInner, searchword)
            judgment = judgeNextPage(browser, searchword)
            if judgment:
                print(f"『{searchword}』 仍有下一頁,繼續爬取!")
                firstPage += 1
                pass
            else:
                browser.quit()
                break

        timeStamp = timeStampGenerator()
        newsTotalNum = len(newsDictInner)
        newsDict["dateTime"] = timeStamp
        newsDict["keyword"] = searchword
        newsDict["newsTotalNum"] = newsTotalNum
        newsDict["newsUrl"] = newsDictInner

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/google/{searchword}/google_{timeStamp}_{newsTotalNum}_{searchword}.json",
                'w',
                encoding='utf-8') as f:
            json.dump(newsDict, f, indent=2, ensure_ascii=False)
        print(
            f'{thisPID}  成功寫出  google_{timeStamp}_{newsTotalNum}_{searchword}.json '
        )

        input.task_done()
        end = timeCalculate()
        print(f'{thisPID}_getPageInARow 累計耗時:{end-begin} 秒')
Example #20
def getPageInARow(input, output, folderWorker, momoMallBrowser):
    begin = timeCalculate()  # needed for the cumulative elapsed-time print below
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        print('getPageInARow is in new process %s, %s ' %
              (getPageInARow_proc, thisPID))
        folderWorker.eraseRawData(searchword)
        folderWorker.mkdirForRawData(searchword)

        url = momoMallBrowser.keywordResourcePair._momoMallKeywordUrlPair[
            searchword]

        # Creating the browser inside the while True loop avoids the situation where the "same browser" keeps visiting pages and gets rejected.
        for i in range(4):
            try:
                timeSleepOne()
                timeSleepRandomly()

                browser = momoMallBrowser.intersectionForCrawl(
                    folderWorker.objective)

                timeSleepRandomly()

                browser.visit(url)

                browserWaitTime(browser)
                timeSleepTwo()

                #  click the "accuracy" sort option; the page jumps back to page 1
                try:
                    buyingTendency = momoMallBrowser.browserClickSearchType(
                        browser, 1)
                    browserWaitTime(browser)
                    timeSleepTwo()
                except AttributeError as e:
                    print(
                        f"{thisPID}__{getPageInARow_proc}  {searchword} 第1頁 點擊準確度有問題。",
                        e)
                    print(
                        f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                    )
                    browserQuit(browser, thisPID, getPageInARow_proc)
                    timeSleepFour()
                    soup = ""
                    continue

                tempHtml = browser.html

                timeSleepRandomly()
                soup = BeautifulSoup(tempHtml, 'lxml')
                print(
                    f"-----------------讀取{searchword}_{buyingTendency}第 1 頁-----------------成功!"
                )

                try:
                    ## current page and total page '頁數5/286'

                    pageState = browser.find_by_xpath(
                        '//*[@id="bt_2_layout_Content"]/div[2]/dl/dt/span')
                    totalPage = int(pageState.text.split('/')[1])
                    currentPage = int(
                        numsHandler.searchFloatNums(
                            pageState.text.split('/')[0]))
                    print(
                        f"-----------------讀取{searchword}_{buyingTendency} 總頁數-----------------成功!"
                    )
                except AttributeError as e:
                    print(f"getPageInARow __{searchword}__出錯", e, "重抓一次!")
                    # Forcing the program to stop? In practice, "raise" only stops the current process,
                    # not the whole set of processes, so "raise" is not suitable here.
                    # raise
                    currentPage = 1  # fallback default
                    totalPage = 3  # fallback default
                    continue
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARow_proc}  讀取{searchword}第 1 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                )
                browserQuit(browser, thisPID, getPageInARow_proc)
                timeSleepFour()
                timeSleepRandomly()
                soup = ""
            except StaleElementReferenceException as e:
                print(
                    "----------------StaleElementReferenceException----------------"
                )
                print(
                    f"{thisPID}__{getPageInARow_proc}  讀取{searchword}第 1 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                )
                browserQuit(browser, thisPID, getPageInARow_proc)
                timeSleepFour()
                timeSleepRandomly()
                soup = ""

        if not soup:
            errorMessage = f"{url}__{currentPage}__" + "\n"
            folderWorker.writeOutFile(
                f"{folderWorker._BASE_PATH}/dataMunging/{folderWorker.objectiveFolder}/{folderWorker.objective}/badRequest",
                f"badRequest_{searchword}.txt",
                errorMessage,
                writeOutType="a")

        folderWorker.writeOutFile(
            f"{folderWorker._BASE_PATH}/dataMunging/{folderWorker.objectiveFolder}/{folderWorker.objective}/{searchword}",
            f"1_{totalPage}_{searchword}.txt", soup)

        print(f'成功寫出  {searchword}  第 {currentPage} 頁')

        print('------接下來要處理 ' + searchword + ' 的頁數---------', totalPage, '頁')

        browserQuit(browser, thisPID, getPageInARow_proc)

        # Rest a bit longer so the first page of every searchword gets read.
        timeSleepEight()
        timeSleepEight()
        timeSleepEight()

        for num in range(2, totalPage + 1):
            strNum = str(num)
            consecutiveData = searchword + "+" + strNum + "+" + str(totalPage)
            output.put(consecutiveData)
            # print(f'這裡是getPageInARow,準備送給  getPageInARowAdvanced  處理:  {searchword} 的 第 {strNum} 頁,總共{totalPage}')
            # print()

        input.task_done()  # notify the main process that this input item is done
        end = timeCalculate()
        print(f'{thisPID}__getPageInARow 累計耗時:{end-begin} 秒')
Example #21
            print(
                f"kill {thisPID}__{getPageInARowAdvanced_proc} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
            )
            os.kill(thisPID, signal.SIGKILL)
        input.task_done()  # notify the main process that this input item is done
        end = timeCalculate()
        print(f'{thisPID}__getPageInARowAdvanced 累計耗時:{end-begin} 秒')


if __name__ == '__main__':

    objectiveFolder = "rawData"

    objective = "momo"

    begin = timeCalculate()

    print('start in main process %s' % os.getpid())

    eraseRawData(objectiveFolder, objective, "badRequest")
    mkdirForRawData(objectiveFolder, objective, "badRequest")
    print(
        '-------------------------------------------------------------------------'
    )

    # declare the shared queues
    searchword_queue = mp.JoinableQueue()
    url_queue = mp.JoinableQueue()

    # start the processes
    Process_1 = []
Example #22
from libs.manipulateDir import listSecondDirBelowFiles
from libs.manipulateDir import mkdirForRawData
from libs.manipulateDir import eraseRawData
from libs.timeWidget import timeCalculate
from libs.timeWidget import timeStampGenerator

if __name__ == '__main__':

    objectiveFolder = "rawData"

    objective = "news"

    dirRoute = f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/google"
    dirRouteWriteOut = f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/newsIntegration"

    begin = timeCalculate()

    # eraseRawData(objectiveFolder, objective, "newsIntegration")
    mkdirForRawData(objectiveFolder, objective, "newsIntegration")

    dirRouteToFiles = listSecondDirBelowFiles(dirRoute)

    newsDict = {}
    newsDictInner = {}
    for file in dirRouteToFiles:
        with open(file) as f:
            inn = json.load(f)
        newsDictInner.update(inn['newsUrl'])

    timeStamp = timeStampGenerator()
    newsTotalNum = len(newsDictInner)
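    # The excerpt stops right after counting the merged news items. Judging from the other
    # examples, the likely continuation (a hedged sketch, with an assumed output file name
    # and assuming json was imported above the excerpt) is to dump the merged dict:
    newsDict["dateTime"] = timeStamp
    newsDict["newsTotalNum"] = newsTotalNum
    newsDict["newsUrl"] = newsDictInner

    with open(f"{dirRouteWriteOut}/news_{timeStamp}_{newsTotalNum}.json", 'w', encoding='utf-8') as f:
        json.dump(newsDict, f, indent=2, ensure_ascii=False)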
Example #23
def getPageInARowAdvanced(input, objectiveFolder, objective):
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        consecutiveUrl = input.get()
        searchword, page, totalPage, url = consecutiveUrl.split('+')
        # print(url)
        print(
            f"{thisPID}__{getPageInARowAdvanced_proc} 開始處理 {searchword} 的第 {page} 頁:"
        )

        # Creating the browser inside the while True loop avoids the situation where the "same browser" keeps visiting pages and gets rejected.
        for i in range(3):
            try:
                timeSleepFour()

                browser = buildSplinterBrowserHeadless('chrome')

                timeSleepRandomly()

                browser.visit(url)

                browserWaitTime(browser)
                timeSleepTwo()

                tempHtml = browser.html
                timeSleepRandomly()

                soup = BeautifulSoup(tempHtml, 'html.parser')
                print(f"讀取{searchword}第 {page} 頁,成功!")
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARowAdvanced_proc} 讀取 {searchword} 第 {page} 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARowAdvanced_proc} 重建browser物件,進行再處理 {i} 次!"
                )
                timeSleepFour()
                timeSleepRandomly()
                soup = ""
            # else:
            #     print(f"讀取{searchword}第 {page} 頁,成功!")

        if not soup:
            badRequestRoute = f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/badRequest"
            with open(f"{badRequestRoute}/badRequest_{searchword}.txt",
                      "a",
                      newline='',
                      encoding='utf-8') as f:  # newline has no effect here...
                errorMessage = url + "\n"
                f.write(errorMessage)  # writelines would apply if errorMessage were a list

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/{page}_{totalPage}_{searchword}.txt",
                'w',
                encoding='utf-8') as f:
            f.write(str(soup))
        print()
        print(f'{thisPID}  成功寫出  {searchword}  第{page}頁,總共{totalPage} 頁。')

        try:
            browser.quit()
            print(
                f"成功關閉 browser{thisPID}__{getPageInARowAdvanced_proc}++++++++++++++++++++++++++++++"
            )
        except:
            print(f"放棄 {thisPID}__{getPageInARowAdvanced_proc} 這個browser。")
            print(
                f"kill {thisPID}__{getPageInARowAdvanced_proc} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
            )
            os.kill(thisPID, signal.SIGKILL)
        input.task_done()  # notify the main process that this input item is done
        end = timeCalculate()
        print(f'{thisPID}__getPageInARowAdvanced 累計耗時:{end-begin} 秒')
Example #24
def detailPageInARow(input, headers, objectiveFolder, objective, *args):
    """
    As many as 28,000 detail urls we are supposed to crawl would inevitalby leave some processes to fail to get the correct responses.
    As such, we should extend more time while crawling , or establish exception handler in porgrams.
    
    """
    # begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        # print(thisPID,"===========================================")

        consecutiveUrl = input.get()
        searchword, url, txtFileRoute = consecutiveUrl.split("+")

        # print('detailPageInARow is in new process %s, %s ' % (detailPageInARow_proc, thisPID))
        # print()

        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()
                soup = BeautifulSoup(res.text, 'html.parser')
                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                soup = ""

        judgeSoup(soup, searchword, url, txtFileRoute)
        # if not soup:
        #   badRequestRoute = "/".join(txtFileRoute.split("/")[:-3]) + "/badRequest"
        #   with open(f"{badRequestRoute}/badRequest_{searchword}.txt", "a",  newline='', encoding='utf-8')as f: # newline沒作用...
        #       errorMessage = url + "\n"
        #       f.write(errorMessage)   #writelines作用在errorMessage是list時
        # elif soup.select_one('head').text.strip() == 'Service Unavailable':
        #   """

        #   「
        #   <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN""http://www.w3.org/TR/html4/strict.dtd">

        #   <html><head><title>Service Unavailable</title>
        #   <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/></head>
        #   <body><h2>Service Unavailable</h2>
        #   <hr/><p>HTTP Error 503. The service is unavailable.</p>
        #   </body></html>
        #   」

        #   """
        #   soup = ""

        with open(txtFileRoute, 'w', encoding='utf-8') as f:
            f.write(str(soup))

        fileName = txtFileRoute.split("/")[-1]
        productIndex = fileName.split("_")[0]
        productNums = fileName.split("_")[1]
        print(
            f"{thisPID}__成功寫出  {searchword}  detail頁, 第 {productIndex} 項, 共 {productNums} 項。"
        )

        timeSleepRandomly()

        # print('這裡是 detailPageInARow 完成: ' + fileName + " 的爬取。")
        end = timeCalculate()
        # print('detailPageInARow 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()