Code Example #1
File: googleNews.py  Project: ilyi1116/iSelect3C
def getPageInARow(url, searchword, firstPage, topTabList, elementUrl):

    browser = buildSplinterBrowser("chrome")
     
    browser.visit(url)
    browserWaitTime(browser)

    searchwordKeyInAndEnter(browser, searchword)
    browser.driver.set_window_size(1024,768)

    forSureNws = findOutNws(browser, topTabList)

    keyNews = [key for key in forSureNws if forSureNws[key] == '新聞'].pop()
    # Exclude the news tab from the human-simulating mouse_over
    topTabList.remove(int(keyNews))

    print(f"點擊 {keyNews} 去到 新聞頁")
    # Click the news tab
    browser.find_by_xpath(f'//*[@id="hdtb-msb-vis"]/div[{keyNews}]/a').click()
    timeSleepRandomly()

    newsDict = {}
    newsDictInner = {}
    while True:
        print(f"進行 {searchword} 第", firstPage, "頁")
        elementUrlExtract(browser, topTabList, elementUrl, newsDictInner)
        judgment = judgeNextPage(browser)
        if judgment:
            print("仍有下一頁,繼續爬取!")
            firstPage += 1
        else:
            browser.quit()
            break

    
    newsDict["dateTime"] = timeStampGenerator()
    newsDict["keyword"] = searchword
    newsDict["newsTotalNum"] = len(newsDictInner)
    newsDict["newsUrl"] = newsDictInner

    return newsDict
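A minimal usage sketch for this example, assuming the project's helpers (buildSplinterBrowser, findOutNws, elementUrlExtract, judgeNextPage, timeStampGenerator) are importable; every argument value below is illustrative, not taken from the project:

# Hypothetical call; topTabList holds assumed 1-based positions of Google's top tabs
# and elementUrl is an assumed XPath for extracting result links.
topTabList = [1, 2, 3, 4, 5]
elementUrl = '//*[@id="rso"]//a'
newsDict = getPageInARow("https://www.google.com", "iSelect3C", 1, topTabList, elementUrl)
print(newsDict["keyword"], newsDict["newsTotalNum"])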
Code Example #2
def getPageInARowAdvanced(input, objectiveFolder, objective):
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        consecutiveUrl = input.get()
        searchword, page, totalPage, url = consecutiveUrl.split('+')
        # print(url)
        print(
            f"{thisPID}__{getPageInARowAdvanced_proc} 開始處理 {searchword} 的第 {page} 頁:"
        )

        # Building the browser inside the while True loop avoids the case where
        # the same long-lived browser gets refused after visiting pages repeatedly.
        for i in range(3):
            try:
                timeSleepFour()

                browser = buildSplinterBrowserHeadless('chrome')

                timeSleepRandomly()

                browser.visit(url)

                browserWaitTime(browser)
                timeSleepTwo()

                tempHtml = browser.html
                timeSleepRandomly()

                soup = BeautifulSoup(tempHtml, 'html.parser')
                print(f"讀取{searchword}第 {page} 頁,成功!")
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARowAdvanced_proc} 讀取 {searchword} 第 {page} 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARowAdvanced_proc} 重建browser物件,進行再處理 {i} 次!"
                )
                timeSleepFour()
                timeSleepRandomly()
                soup = ""
            # else:
            #     print(f"讀取{searchword}第 {page} 頁,成功!")

        if not soup:
            badRequestRoute = f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/badRequest"
            with open(f"{badRequestRoute}/badRequest_{searchword}.txt",
                      "a",
                      newline='',
                      encoding='utf-8') as f:  # newline has no effect here...
                errorMessage = url + "\n"
                f.write(errorMessage)  # writelines would apply if errorMessage were a list

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/{page}_{totalPage}_{searchword}.txt",
                'w',
                encoding='utf-8') as f:
            f.write(str(soup))
        print()
        print(f'{thisPID}  成功寫出  {searchword}  第{page}頁,總共{totalPage} 頁。')

        try:
            browser.quit()
            print(
                f"成功關閉 browser{thisPID}__{getPageInARowAdvanced_proc}++++++++++++++++++++++++++++++"
            )
        except Exception:
            print(f"放棄 {thisPID}__{getPageInARowAdvanced_proc} 這個browser。")
            print(
                f"kill {thisPID}__{getPageInARowAdvanced_proc} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
            )
            os.kill(thisPID, signal.SIGKILL)
        input.task_done()  # tell the main process this input item is done!
        end = timeCalculate()
        print(f'{thisPID}__getPageInARowAdvanced 累計耗時:{end-begin} 秒')
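The comment above captures the key idea: rebuild the browser on every attempt so one refused session does not poison the next request. A minimal sketch of that retry shape, independent of this project (fetch_with_fresh_browser and build_browser are illustrative names):

import time
import random
from bs4 import BeautifulSoup

def fetch_with_fresh_browser(build_browser, url, retries=3):
    # Rebuild the browser each attempt; return "" when every attempt fails,
    # mirroring the soup = "" fallback used above.
    for attempt in range(retries):
        browser = None
        try:
            browser = build_browser()
            browser.visit(url)
            return BeautifulSoup(browser.html, 'html.parser')
        except Exception as e:  # the project narrows this to ConnectionRefusedError, TimeoutException, WebDriverException
            print(f"attempt {attempt} failed: {e}")
            time.sleep(4 + random.random())
        finally:
            if browser is not None:
                try:
                    browser.quit()
                except Exception:
                    pass
    return ""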
Code Example #3
def getPageInARow(input, output, keywordUrlPair, objectiveFolder, objective):
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        print('getPageInARow is in new process %s, %s ' %
              (getPageInARow_proc, thisPID))
        print()
        eraseRawData(objectiveFolder, objective, searchword)
        mkdirForRawData(objectiveFolder, objective, searchword)

        url = keywordUrlPair[searchword]

        # Building the browser inside the while True loop avoids the case where
        # the same long-lived browser gets refused after visiting pages repeatedly.
        for i in range(3):
            try:
                timeSleepOne()
                timeSleepRandomly()

                browser = buildSplinterBrowserHeadless('chrome')

                timeSleepRandomly()

                browser.visit(url)

                browserWaitTime(browser)
                timeSleepTwo()

                tempHtml = browser.html

                timeSleepRandomly()
                soup = BeautifulSoup(tempHtml, 'html.parser')
                print(
                    f"讀取{searchword}第 1 頁>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>成功!"
                )
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARow_proc}  讀取{searchword}第 1 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                )
                timeSleepFour()
                timeSleepRandomly()
                soup = ""
            # else:
            #     print(f"讀取{searchword}第 1 頁>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>成功!")

        try:
            totalPage = interDiv(searchNums(soup.select_one('.totalTxt').text),
                                 30)
        except AttributeError as e:
            print("getPageInARow 出錯", e)
            # force the program to stop here
            raise

        print('------接下來要處理 ' + searchword + ' 的頁數---------', totalPage, '頁')
        print()

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/1_{totalPage}_{searchword}.txt",
                'w',
                encoding='utf-8') as f:
            f.write(str(soup))
        print()
        print(f'成功寫出  {searchword}  第 1 頁')

        i_browser = 1
        try:
            browser.quit()
            print(
                f"成功關閉 browser{getPageInARow_proc}++++++++++++++++++++++++++++++"
            )
        except Exception:
            print(
                f"放棄 {thisPID}__{getPageInARow_proc} 的 第{i_browser}個browser。")
            i_browser += 1
            print(
                f"kill {thisPID}__{getPageInARow_proc} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
            )
            os.kill(thisPID, signal.SIGKILL)

        # Sleep longer so the first page of every searchword gets read.
        timeSleepEight()
        timeSleepEight()

        for num in range(2, totalPage + 1):
            strNum = str(num)
            consecutiveData = searchword + "+" + strNum + "+" + str(
                totalPage) + "+" + re.sub(r"curPage=1", f"curPage={strNum}",
                                          url)
            output.put(consecutiveData)
            # print(f'這裡是getPageInARow,準備送給  getPageInARowAdvanced  處理:  {searchword} 的 第 {strNum} 頁,總共{totalPage}')
            print()
        input.task_done()  # tell the main process this input item is done!
        end = timeCalculate()
        print(f'{thisPID}__getPageInARow 累計耗時:{end-begin} 秒')
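interDiv is not shown in this listing, but since each result page holds 30 items and the code needs a page count, it plausibly performs a ceiling division; a sketch under that assumption:

def interDiv(totalNums, perPage):
    # Ceiling division: 60 items -> 2 pages, 61 items -> 3 pages.
    return -(-totalNums // perPage)

assert interDiv(60, 30) == 2
assert interDiv(61, 30) == 3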
Code Example #4
    def humanSimulate(cls, browser):
        """
        之所以click()不成功是因為mouse_over後,browser視窗看不到要點擊的xpath了!
        
        WebDriverException: Message: unknown error: Element <a class="selected">...</a> is not clickable at point (338, 13). Other element would receive the click: <div id="bt_0_002_01" class="">...</div>
        (Session info: chrome=80.0.3987.122)
        (Driver info: chromedriver=2.36.540471 (9c759b81a907e70363c6312294d30b6ccccc2752),platform=Linux 4.15.0-65-generic x86_64)
        
        但依舊可以用boolean的方式判斷;不過視窗的移動,不影響mouse_over()
        if browser.is_element_present_by_xpath('//*[@id="bt_2_layout_Content"]/div[2]/ul/li[1]/a'):
            print(1)  
        """
        searchTypeList = [row for row in range(1, 5)]
        pageList = [row for row in range(1, 13)]  # 頁碼欄的第一頁只有12項
        brandAndClassList = [row for row in range(2)]

        randomTypeNum = random.choice(searchTypeList)
        randomPageNum = random.choice(pageList)
        randomBrandClassNum = random.choice(brandAndClassList)

        try:
            try:
                # Target the pager (at most 12 items); the last view may have fewer than 12
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{randomPageNum}]/a'
                ).mouse_over()
                browserWaitTime(browser)
            except AttributeError as e:  # no element found to mouse_over()
                print("頁碼不足12項___擬人化操作找不到 Element。", e)
                browserWaitTime(browser)

            browser.execute_script(
                'window.scrollTo(0, document.body.scrollHeight);')
            timeSleepOne()
            # Target the four sort options (accuracy ... price)
            browser.find_by_xpath(
                f'//*[@id="bt_2_layout_Content"]/div[3]/span/ul/li[{randomTypeNum}]'
            ).mouse_over()
            timeSleepOne()

            # Target the brand and product-category menus
            if randomBrandClassNum:
                if browser.is_element_present_by_xpath(
                        '//*[@id="categoriesBtn"]'):
                    browser.find_by_xpath(
                        '//*[@id="categoriesBtn"]').mouse_over()
                elif browser.is_element_present_by_xpath(
                        '//*[@id="bt_0_layout_b203"]'):
                    browser.find_by_xpath(
                        '//*[@id="bt_0_layout_b203"]').mouse_over()
            else:
                if browser.is_element_present_by_xpath(
                        '//*[@id="bt_0_layout_b203"]'):
                    browser.find_by_xpath(
                        '//*[@id="bt_0_layout_b203"]').mouse_over()
                elif browser.is_element_present_by_xpath(
                        '//*[@id="categoriesBtn"]'):
                    browser.find_by_xpath(
                        '//*[@id="categoriesBtn"]').mouse_over()

            timeSleepOne()
            browser.execute_script('window.scrollTo(0,0);')

        except AttributeError as e:  # no element found to mouse_over()
            print("擬人化操作找不到 Element。", e)
Code Example #5
    def browserClickPageNumber(cls, browser, currentPage, totalPage,
                               searchword):
        """
        #點擊頁數
        # 預設
        # 1 2 3 4 5 6 7 8 9 10 >> >|
        # 頂 上10頁
        # 1                                      14
        # |< << 11 12 13 14 15 16 17 18 19 20 >> >|
        
        # 置底
        # |< << 281 282 283 284

        # accuratePage =  browser.find_by_xpath('//*[@id="bt_2_layout_Content"]/div[2]/ul/li[8]/a')
        accuratePage =  browser.find_by_xpath('//*[@id="bt_2_layout_Content"]/div[2]/ul/li[1]/a')
        accuratePage.text
        """

        currentPageNum = int(currentPage)
        totalPageNum = int(totalPage)
        halfTotalPageNum = totalPageNum // 2

        if currentPageNum > halfTotalPageNum and currentPageNum > 10:
            # jump to the last pager view
            browser.find_by_xpath(
                '//*[@id="bt_2_layout_Content"]/div[2]/ul/li[12]/a').click()
            timeSleepOne()

            if currentPageNum != totalPageNum and currentPageNum // 10 == totalPageNum // 10:
                if currentPageNum % 10 != 0:
                    # e.g. currentPage 13, totalPage 18
                    clickBeforeTimes = 0
                elif currentPageNum % 10 == 0:
                    # e.g. currentPage 290, totalPage 299
                    clickBeforeTimes = 1

                # click backwards until the correct pager view is shown
                for i in range(clickBeforeTimes):
                    browser.find_by_xpath(
                        '//*[@id="bt_2_layout_Content"]/div[2]/ul/li[2]/a'
                    ).click()
                    # timeSleepRandomly()
                    # timeSleepOne()
                    browserWaitTime(browser)

            elif currentPageNum != totalPageNum and currentPageNum // 10 < totalPageNum // 10:

                if currentPageNum % 10 != 0 and totalPageNum % 10 == 0:  # and totalPageNum - currentPageNum < 10:
                    # e.g. currentPage 281, totalPage 290
                    # e.g. currentPage 271, totalPage 290
                    # e.g. currentPage 11, totalPage 30
                    clickBeforeTimes = (totalPageNum //
                                        10) - (currentPageNum // 10) - 1

                elif currentPageNum % 10 != 0 and totalPageNum % 10 != 0:  # and totalPageNum - currentPageNum >= 10:
                    # e.g. currentPage 271, totalPage 291
                    # e.g. currentPage 18, totalPage 23
                    clickBeforeTimes = (totalPageNum //
                                        10) - (currentPageNum // 10)

                elif currentPageNum % 10 == 0 and totalPageNum % 10 != 0:
                    # e.g. currentPage 270, totalPage 291
                    clickBeforeTimes = (totalPageNum //
                                        10) - (currentPageNum // 10) + 1

                elif currentPageNum % 10 == 0 and totalPageNum % 10 == 0:
                    # e.g. currentPage 270, totalPage 290
                    clickBeforeTimes = (totalPageNum //
                                        10) - (currentPageNum // 10)

                # click backwards until the correct pager view is shown
                for i in range(clickBeforeTimes):
                    browser.find_by_xpath(
                        '//*[@id="bt_2_layout_Content"]/div[2]/ul/li[2]/a'
                    ).click()
                    # timeSleepRandomly()
                    # timeSleepOne()
                    browserWaitTime(browser)

            # click the exact page number
            judgeNum = currentPageNum % 10
            if judgeNum:
                clickNum = judgeNum + 2
            elif judgeNum == 0:
                clickNum = judgeNum + 12
            print(
                f"反方向__{searchword}__目標頁碼:{currentPage}, 點擊項次:{clickNum}, 總頁數:{totalPage}"
            )
            browser.find_by_xpath(
                f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
            ).click()
            accuratePage = browser.find_by_xpath(
                f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
            ).text

            print(
                f"反方向__{searchword}__目標頁碼:{currentPage}, 點擊頁碼:{accuratePage}, 總頁數:{totalPage}"
            )

        else:
            if currentPageNum <= 10:
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{currentPageNum}]/a'
                ).click()
                accuratePage = browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{currentPageNum}]/a'
                ).text

            elif 11 <= currentPageNum <= 20:
                # go to pages 11-20
                browser.find_by_xpath(
                    '//*[@id="bt_2_layout_Content"]/div[2]/ul/li[11]/a'
                ).click()
                clickNum = currentPageNum - 10 + 2
                timeSleepOne()
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
                ).click()
                accuratePage = browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
                ).text
            else:
                # go to pages 11-20
                browser.find_by_xpath(
                    '//*[@id="bt_2_layout_Content"]/div[2]/ul/li[11]/a'
                ).click()

                if currentPageNum % 10 == 0:
                    # observed log: 電冰箱__target page: 290, clicked page: 300, total pages: 921
                    clickNextTimes = currentPageNum // 10 - 1
                else:
                    # observed log: 冰箱__target page: 292, clicked page: 292, total pages: 921
                    clickNextTimes = currentPageNum // 10

                # click forward to the correct pager view
                for i in range(clickNextTimes - 1):  # minus 1 because we already jumped to pages 11-20
                    browser.find_by_xpath(
                        '//*[@id="bt_2_layout_Content"]/div[2]/ul/li[13]/a'
                    ).click()
                    # timeSleepRandomly()
                    # timeSleepOne()
                    browserWaitTime(browser)

                # click the exact page number
                judgeNum = currentPageNum - (clickNextTimes * 10)
                if judgeNum:
                    clickNum = judgeNum + 2
                elif judgeNum == 0:
                    clickNum = judgeNum + 12
                browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
                ).click()
                accuratePage = browser.find_by_xpath(
                    f'//*[@id="bt_2_layout_Content"]/div[2]/ul/li[{clickNum}]/a'
                ).text

            print(
                f"{searchword}__目標頁碼:{currentPage}, 點擊頁碼:{accuratePage}, 總頁數:{totalPage}"
            )
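The li-index rule used throughout browserClickPageNumber can be checked in isolation. Past page 10, the pager shows two leading controls (|< and <<) before up to ten page links, so page 10k+j sits at li[j+2] and exact multiples of 10 sit at li[12]; pageToLiIndex below is an illustrative helper, not part of the project:

def pageToLiIndex(pageNum):
    # Pager views past page 10 look like: |< << 11 12 ... 20 >> >|
    # so li[1] and li[2] are controls and pages occupy li[3]..li[12].
    j = pageNum % 10
    return j + 2 if j else 12

assert pageToLiIndex(13) == 5    # |< << 11 12 13 -> 13 is the 5th item
assert pageToLiIndex(283) == 5   # same slot within its own view
assert pageToLiIndex(290) == 12  # multiples of 10 take the last page slot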
Code Example #6
def getPageInARow(input, url, firstPage, topTabList, elementUrl,
                  objectiveFolder, objective, *args):
    begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()

        mkdirForRawData(objectiveFolder,
                        objective,
                        "google",
                        keyword=searchword)
        browser = buildSplinterBrowserHeadless("chrome")

        browser.visit(url)
        browserWaitTime(browser)

        searchwordKeyInAndEnter(browser, searchword)
        browser.driver.set_window_size(1024, 768)

        forSureNws = findOutNws(browser, topTabList)
        keyNews = [key for key in forSureNws if forSureNws[key] == '新聞'].pop()
        # Exclude the news tab from the human-simulating mouse_over
        topTabList.remove(int(keyNews))

        print(f"點擊 topTabList {keyNews} 去到 新聞頁")
        # Click the news tab
        browser.find_by_xpath(
            f'//*[@id="hdtb-msb-vis"]/div[{keyNews}]/a').click()
        timeSleepRandomly()

        newsDict = {}
        newsDictInner = {}
        while True:
            print(f"進行 {searchword} 第", firstPage, "頁")
            elementUrlExtract(browser, firstPage, topTabList, elementUrl,
                              newsDictInner, searchword)
            judgment = judgeNextPage(browser, searchword)
            if judgment:
                print(f"『{searchword}』 仍有下一頁,繼續爬取!")
                firstPage += 1
            else:
                browser.quit()
                break

        timeStamp = timeStampGenerator()
        newsTotalNum = len(newsDictInner)
        newsDict["dateTime"] = timeStamp
        newsDict["keyword"] = searchword
        newsDict["newsTotalNum"] = newsTotalNum
        newsDict["newsUrl"] = newsDictInner

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/google/{searchword}/google_{timeStamp}_{newsTotalNum}_{searchword}.json",
                'w',
                encoding='utf-8') as f:
            json.dump(newsDict, f, indent=2, ensure_ascii=False)
        print(
            f'{thisPID}  成功寫出  google_{timeStamp}_{newsTotalNum}_{searchword}.json '
        )

        input.task_done()
        end = timeCalculate()
        print(f'{thisPID}_getPageInARow 累計耗時:{end-begin} 秒')
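These workers rely on the standard multiprocessing.JoinableQueue handshake: input.get() blocks for the next keyword and input.task_done() lets the main process's join() return. A minimal, self-contained sketch of that wiring (the worker body and keywords are illustrative):

from multiprocessing import Process, JoinableQueue

def worker(inputQueue):
    while True:
        searchword = inputQueue.get()   # blocks until the main process puts an item
        print('processing', searchword)
        inputQueue.task_done()          # each get() must be matched for join() to return

if __name__ == '__main__':
    inputQueue = JoinableQueue()
    for _ in range(2):
        Process(target=worker, args=(inputQueue,), daemon=True).start()
    for kw in ['冰箱', '電冰箱']:
        inputQueue.put(kw)
    inputQueue.join()                   # returns once every item is task_done()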
Code Example #7
from bs4 import BeautifulSoup  # used below; missing from this excerpt
from libs.splinterBrowser import buildSplinterBrowser  # used below; assumed to live beside browserWaitTime
from libs.splinterBrowser import browserWaitTime
from libs.timeWidget import timeCalculate
from libs.timeWidget import timeStampGenerator
from libs.manipulateDir import mkdirForRawData
from libs.manipulateDir import eraseRawData

if __name__ == '__main__':

    objectiveFolder = "rawData"

    objective = "observationStation"

    begin = timeCalculate()

    browser = buildSplinterBrowser("chrome")
    browserWaitTime(browser)

    browser.visit("http://e-service.cwb.gov.tw/HistoryDataQuery/")

    # wait for the map's JS to render
    browser.is_element_present_by_xpath('//*[@id="con_r"]/div/div[1]',
                                        wait_time=5)

    soup = BeautifulSoup(browser.html, "html.parser")

    browser.quit()
    print("==============================quit==============================")

    eraseRawData(objectiveFolder, objective, "overviewData")
    mkdirForRawData(objectiveFolder, objective, "overviewData")
Code Example #8
def getPageInARow(input, output, folderWorker, momoMallBrowser):
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        print('getPageInARow is in new process %s, %s ' %
              (getPageInARow_proc, thisPID))
        folderWorker.eraseRawData(searchword)
        folderWorker.mkdirForRawData(searchword)

        url = momoMallBrowser.keywordResourcePair._momoMallKeywordUrlPair[
            searchword]

        # Building the browser inside the while True loop avoids the case where
        # the same long-lived browser gets refused after visiting pages repeatedly.
        currentPage, totalPage = 1, 1  # defaults so the writes below never hit unbound names if every retry fails
        for i in range(4):
            try:
                timeSleepOne()
                timeSleepRandomly()

                browser = momoMallBrowser.intersectionForCrawl(
                    folderWorker.objective)

                timeSleepRandomly()

                browser.visit(url)

                browserWaitTime(browser)
                timeSleepTwo()

                # Click the 'accuracy' sort; the pager resets to page 1
                try:
                    buyingTendency = momoMallBrowser.browserClickSearchType(
                        browser, 1)
                    browserWaitTime(browser)
                    timeSleepTwo()
                except AttributeError as e:
                    print(
                        f"{thisPID}__{getPageInARow_proc}  {searchword} 第1頁 點擊準確度有問題。",
                        e)
                    print(
                        f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                    )
                    browserQuit(browser, thisPID, getPageInARow_proc)
                    timeSleepFour()
                    soup = ""
                    continue

                tempHtml = browser.html

                timeSleepRandomly()
                soup = BeautifulSoup(tempHtml, 'lxml')
                print(
                    f"-----------------讀取{searchword}_{buyingTendency}第 1 頁-----------------成功!"
                )

                try:
                    ## current page and total page '頁數5/286'

                    pageState = browser.find_by_xpath(
                        '//*[@id="bt_2_layout_Content"]/div[2]/dl/dt/span')
                    totalPage = int(pageState.text.split('/')[1])
                    currentPage = int(
                        numsHandler.searchFloatNums(
                            pageState.text.split('/')[0]))
                    print(
                        f"-----------------讀取{searchword}_{buyingTendency} 總頁數-----------------成功!"
                    )
                except AttributeError as e:
                    print(f"getPageInARow __{searchword}__出錯", e, "重抓一次!")
                    # Forcing a stop? Observation: 'raise' only stops the current
                    # process, not the whole pool, so 'raise' is unsuitable here.
                    # raise
                    currentPage = 1  # fallback default
                    totalPage = 3  # fallback default
                    continue
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARow_proc}  讀取{searchword}第 1 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                )
                browserQuit(browser, thisPID, getPageInARow_proc)
                timeSleepFour()
                timeSleepRandomly()
                soup = ""
            except StaleElementReferenceException as e:
                print(
                    "----------------StaleElementReferenceException----------------"
                )
                print(
                    f"{thisPID}__{getPageInARow_proc}  讀取{searchword}第 1 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                )
                browserQuit(browser, thisPID, getPageInARow_proc)
                timeSleepFour()
                timeSleepRandomly()
                soup = ""

        if not soup:
            errorMessage = f"{url}__{currentPage}__" + "\n"
            folderWorker.writeOutFile(
                f"{folderWorker._BASE_PATH}/dataMunging/{folderWorker.objectiveFolder}/{folderWorker.objective}/badRequest",
                f"badRequest_{searchword}.txt",
                errorMessage,
                writeOutType="a")

        folderWorker.writeOutFile(
            f"{folderWorker._BASE_PATH}/dataMunging/{folderWorker.objectiveFolder}/{folderWorker.objective}/{searchword}",
            f"1_{totalPage}_{searchword}.txt", soup)

        print(f'成功寫出  {searchword}  第 {currentPage} 頁')

        print('------接下來要處理 ' + searchword + ' 的頁數---------', totalPage, '頁')

        browserQuit(browser, thisPID, getPageInARow_proc)

        # Sleep longer so the first page of every searchword gets read.
        timeSleepEight()
        timeSleepEight()
        timeSleepEight()

        for num in range(2, totalPage + 1):
            strNum = str(num)
            consecutiveData = searchword + "+" + strNum + "+" + str(totalPage)
            output.put(consecutiveData)
            # print(f'這裡是getPageInARow,準備送給  getPageInARowAdvanced  處理:  {searchword} 的 第 {strNum} 頁,總共{totalPage}')
            # print()

        input.task_done()  # tell the main process this input item is done!
        end = timeCalculate()
        print(f'{thisPID}__getPageInARow 累計耗時:{end-begin} 秒')
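folderWorker.writeOutFile is a project helper not included in this listing; from its call sites (directory route, file name, content, and writeOutType="a" for the badRequest log) it plausibly behaves like this sketch:

import os

def writeOutFile(dirRoute, fileName, content, writeOutType='w'):
    # Inferred behaviour only: write str(content) into dirRoute/fileName,
    # appending when writeOutType='a' (as the badRequest log above does).
    os.makedirs(dirRoute, exist_ok=True)
    with open(os.path.join(dirRoute, fileName), writeOutType, encoding='utf-8') as f:
        f.write(str(content))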
Code Example #9
def getPageInARowAdvanced(input, folderWorker, momoMallBrowser):
    """
    開始對POST網址進行splinter的點擊
    """
    thisPID = os.getpid()
    while True:
        # print(thisPID,"===========================================")
        consecutiveData = input.get()
        searchword, currentPage, totalPage = consecutiveData.split('+')

        url = momoMallBrowser.keywordResourcePair._momoMallKeywordUrlPair[
            searchword]

        # Building the browser inside the while True loop avoids the case where
        # the same long-lived browser gets refused after visiting pages repeatedly.
        for i in range(4):
            try:
                timeSleepFour()

                browser = momoMallBrowser.intersectionForCrawl(
                    folderWorker.objective)

                timeSleepRandomly()

                browserSetWindowSize(browser, horizon=1920, vertical=1080)
                timeSleepOne()

                browser.visit(url)

                browserWaitTime(browser)
                timeSleepTwo()

                # Click the 'accuracy' sort; the pager resets to page 1
                try:
                    buyingTendency = momoMallBrowser.browserClickSearchType(
                        browser, 1)
                    browserWaitTime(browser)
                    timeSleepTwo()
                except AttributeError as e:
                    print(
                        f"{thisPID}__{getPageInARowAdvanced_proc}  {searchword} 在第{currentPage}頁點擊準確度有問題。",
                        e)
                    print(
                        f"{thisPID}__{getPageInARowAdvanced_proc}  重建browser物件,進行再處理 {i} 次!"
                    )
                    browserQuit(browser, thisPID, getPageInARowAdvanced_proc)
                    timeSleepFour()
                    soup = ""
                    continue

                # click through to the correct page
                momoMallBrowser.browserClickPageNumber(browser, currentPage,
                                                       totalPage, searchword)

                tempHtml = browser.html
                timeSleepRandomly()

                # human-like simulation
                momoMallBrowser.humanSimulate(browser)

                soup = BeautifulSoup(tempHtml, 'lxml')
                # print(f"讀取{searchword}第 {currentPage} 頁,成功!")
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARowAdvanced_proc} 讀取 {searchword} 第 {currentPage} 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARowAdvanced_proc} 重建browser物件,進行再處理 {i} 次!"
                )
                browserQuit(browser, thisPID, getPageInARowAdvanced_proc)
                timeSleepFour()
                timeSleepRandomly()
                soup = ""
            # else:
            #     print(f"讀取{searchword}第 {page} 頁,成功!")

        if not soup:
            errorMessage = f"{url}__{currentPage}__" + "\n"
            folderWorker.writeOutFile(
                f"{folderWorker._BASE_PATH}/dataMunging/{folderWorker.objectiveFolder}/{folderWorker.objective}/badRequest",
                f"badRequest_{searchword}.txt",
                errorMessage,
                writeOutType="a")

        folderWorker.writeOutFile(
            f"{folderWorker._BASE_PATH}/dataMunging/{folderWorker.objectiveFolder}/{folderWorker.objective}/{searchword}",
            f"{currentPage}_{totalPage}_{searchword}.txt", soup)

        # print(f'{thisPID}  成功寫出  {searchword}  第{currentPage}頁,總共{totalPage} 頁。')

        browserQuit(browser, thisPID, getPageInARowAdvanced_proc)

        input.task_done()  # tell the main process this input item is done!
        end = timeCalculate()
        print(f'{thisPID}__getPageInARowAdvanced 累計耗時:{end-begin} 秒')