Example #1
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
                newsContent = [
                    # stripped_strings yields plain strings, so use row directly
                    textMiningRegex.discardSpace(
                        textMiningRegex.replaceEscapeAlphabet(row))
                    for row in soup.find("article").stripped_strings
                    if row != "" and "googletag.cmd.push" not in row
                    and "function" not in row
                ]
                videoLinkInContent = None  # no video embedded in the article body itself
                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
Example #2
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
                newsContent = [
                    textMiningRegex.discardSpace(
                        textMiningRegex.replaceEscapeAlphabet(row.text))
                    for row in soup.select_one(".article-body").select("p")
                    if row.text != ""
                ]
                videoLinkInContent = None  # no video embedded in the article body itself

                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
Example #3
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
                newsContent = [
                    row for row in soup.select_one(".story").stripped_strings
                ]

                # video embedded in the article body
                if soup.p.iframe:  #.attrs.get("src"):
                    videoLinkInContent = soup.p.iframe.attrs.get("src")
                    print("ETtoday 發現內文有影片:", videoLinkInContent)

                else:
                    videoLinkInContent = None

                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
Example #4
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
                newsContent = ([
                    row for row in soup.select_one(".newsdetail_content").find(
                        "div", {
                            "class": "contxt margin_b20"
                        }).find("div", {
                            "id": "news_detail_div"
                        }).stripped_strings
                ])
                # video embedded in the article body
                if soup.select_one(".newsdetail_content").find(
                        "div", {
                            "class": "contxt margin_b20"
                        }).find("iframe", {"class": "video"}):
                    linkInContent = soup.select_one(
                        ".newsdetail_content").find(
                            "div", {
                                "class": "contxt margin_b20"
                            }).find("iframe", {
                                "class": "video"
                            }).attrs.get("src")
                    videoID = urlParseDealing.urlParsePath(
                        linkInContent
                    ).split("/")[
                        -1]  #videoID = link.split("/embed/")[1].split("?")[0]
                    videoLinkInContent = f"https://youtube.com/watch?v={videoID}"
                    print("TVBS 發現內文有影片:", videoLinkInContent)

                else:
                    videoLinkInContent = None

                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
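The video-ID step above relies on the project's `urlParseDealing.urlParsePath` helper. A minimal standard-library sketch of the same idea, assuming the iframe src is a regular YouTube embed URL (the function name below is illustrative, not part of the project):

# Illustrative stdlib equivalent of the video-ID extraction; youtube_watch_url
# is a made-up name, and the input is assumed to be a standard embed URL.
from urllib.parse import urlparse


def youtube_watch_url(embed_src):
    # The video ID is the last path segment of the embed URL; the query string
    # (e.g. "?rel=0") is not part of urlparse().path, so it drops out here.
    video_id = urlparse(embed_src).path.split("/")[-1]
    return f"https://youtube.com/watch?v={video_id}"


print(youtube_watch_url("https://www.youtube.com/embed/abc123?rel=0"))
# -> https://youtube.com/watch?v=abc123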
Example #5
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')

                pass  # parsing of the article content is left unimplemented in this example

                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()

        return
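Example #5 keeps only the skeleton that all of these `requests` methods share: sleep, fetch, retry on ConnectionError, parse with BeautifulSoup. A self-contained sketch of that shared pattern, using plain `time.sleep` in place of the project's timeSleepOne/timeSleepRandomly helpers:

# Self-contained sketch of the shared retry pattern; time.sleep stands in for
# the project's timeSleepOne/timeSleepRandomly helpers.
import time

import requests
from bs4 import BeautifulSoup


def fetch_soup(url, headers, retries=3, delay=2):
    soup = None
    for _ in range(retries):
        try:
            time.sleep(delay)
            res = requests.get(url, headers=headers)
            res.encoding = 'utf-8'
            soup = BeautifulSoup(res.text, 'html.parser')
            break
        except requests.exceptions.ConnectionError as e:
            print(url, "connection error:", e)
            time.sleep(delay)
    return soup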
Example #6
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()

                if "https://ent.ltn.com.tw/news/" in url:
                    videoLinkInContent, newsContent = ltnRequests.requestsUrlWithENT(
                        url, headers)
                    break

                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(
                    res.text, 'lxml'
                )  # html.parser is not strong enough: it fails to pick up the content of https://ec.ltn.com.tw/article/paper/1295417
                try:
                    newsContent = [
                        textMiningRegex.discardSpace(
                            textMiningRegex.replaceEscapeAlphabet(row.text))
                        for row in soup.select_one(".text").select("p")
                        if row.text != ""
                    ]
                    videoLinkInContent = None  # no video embedded in the article body itself
                except AttributeError as e:
                    # https://news.ltn.com.tw/news/consumer/paper/1284005  --> https://ent.ltn.com.tw/news/paper/1284005
                    print("error code:", e, url)
                    videoLinkInContent, newsContent = ltnRequests.requestsUrlWithENT(
                        url, headers)
                break

            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
Example #7
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
                try:
                    newsContent = soup.find("article", {
                        "itemprop": "articleBody"
                    }).text.strip().split(" ")
                except AttributeError as e:
                    # url = "https://tw.news.yahoo.com/video/%E7%AF%80%E8%83%BD%E5%AE%B6%E9%9B%BB%E8%A3%9C%E5%8A%A9%E5%86%8D%E5%8A%A0%E7%A2%BC-%E8%B2%A8%E7%89%A9%E7%A8%85%E6%B8%9B%E5%85%8D%E9%96%8B%E8%B7%91-053307068.html"
                    # print("error code:", e, url)
                    try:
                        newsContent = soup.find("article").text.strip().split(
                            " ")
                    except AttributeError as e:
                        # "https://tw.news.yahoo.com/%E9%BB%83%E9%87%91%E9%80%B1%E5%A4%A7%E5%90%8C3c%E9%85%AC%E8%B3%93%E7%9B%9B%E5%85%B8-%E6%B6%BC%E5%A4%8F%E6%9C%80%E5%BC%B7%E6%AA%94-081101070.html": [
                        # "黃金週大同3C酬賓盛典涼夏最強檔",
                        print("error code:", "這則新聞爆炸了!", url)
                        newsContent = None

                videoLinkInContent = None  # no video embedded in the article body itself
                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
Example #8
def getPageInARowAdvanced(input, objectiveFolder, objective):
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        consecutiveUrl = input.get()
        searchword, page, totalPage, url = consecutiveUrl.split('+')
        # print(url)
        print(
            f"{thisPID}__{getPageInARowAdvanced_proc} 開始處理 {searchword} 的第 {page} 頁:"
        )

        # Putting the browser-creation code inside the while True loop avoids the situation where the same browser instance keeps visiting pages and gets rejected.
        for i in range(3):
            try:
                timeSleepFour()

                browser = buildSplinterBrowserHeadless('chrome')

                timeSleepRandomly()

                browser.visit(url)

                browserWaitTime(browser)
                timeSleepTwo()

                tempHtml = browser.html
                timeSleepRandomly()

                soup = BeautifulSoup(tempHtml, 'html.parser')
                print(f"讀取{searchword}第 {page} 頁,成功!")
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARowAdvanced_proc} 讀取 {searchword} 第 {page} 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARowAdvanced_proc} 重建browser物件,進行再處理 {i} 次!"
                )
                timeSleepFour()
                timeSleepRandomly()
                soup = ""
            # else:
            #     print(f"讀取{searchword}第 {page} 頁,成功!")

        if not soup:
            badRequestRoute = f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/badRequest"
            with open(f"{badRequestRoute}/badRequest_{searchword}.txt",
                      "a",
                      newline='',
                      encoding='utf-8') as f:  # newline has no effect here...
                errorMessage = url + "\n"
                f.write(errorMessage)  # writelines would be used if errorMessage were a list

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/{page}_{totalPage}_{searchword}.txt",
                'w',
                encoding='utf-8') as f:
            f.write(str(soup))
        print()
        print(f'{thisPID}  成功寫出  {searchword}  第{page}頁,總共{totalPage} 頁。')

        try:
            browser.quit()
            print(
                f"成功關閉 browser{thisPID}__{getPageInARowAdvanced_proc}++++++++++++++++++++++++++++++"
            )
        except:
            print(f"放棄 {thisPID}__{getPageInARowAdvanced_proc} 這個browser。")
            print(
                f"kill {thisPID}__{getPageInARowAdvanced_proc} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
            )
            os.kill(thisPID, signal.SIGKILL)
        input.task_done()  # notify the main process that this input item has been processed
        end = timeCalculate()
        print(f'{thisPID}__getPageInARowAdvanced 累計耗時:{end-begin} 秒')
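The comment inside the loop explains why the browser is rebuilt on every retry: reusing a single browser instance across visits tends to get rejected. `buildSplinterBrowserHeadless` is a project helper; a rough sketch of what such a wrapper might look like with splinter and headless Chrome (an assumption, not the project's actual code):

# Assumed sketch of a buildSplinterBrowserHeadless-style helper; the real one
# lives in the author's project and may differ.
from splinter import Browser


def build_headless_chrome():
    # A fresh headless Chrome instance per retry, so a blocked or timed-out
    # browser is never reused for the next visit.
    return Browser('chrome', headless=True)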
Example #9
def getPageInARow(input, output, keywordUrlPair, objectiveFolder, objective):
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        print('getPageInARow is in new process %s, %s ' %
              (getPageInARow_proc, thisPID))
        print()
        eraseRawData(objectiveFolder, objective, searchword)
        mkdirForRawData(objectiveFolder, objective, searchword)

        url = keywordUrlPair[searchword]

        # Putting the browser-creation code inside the while True loop avoids the situation where the same browser instance keeps visiting pages and gets rejected.
        for i in range(3):
            try:
                timeSleepOne()
                timeSleepRandomly()

                browser = buildSplinterBrowserHeadless('chrome')

                timeSleepRandomly()

                browser.visit(url)

                browserWaitTime(browser)
                timeSleepTwo()

                tempHtml = browser.html

                timeSleepRandomly()
                soup = BeautifulSoup(tempHtml, 'html.parser')
                print(
                    f"讀取{searchword}第 1 頁>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>成功!"
                )
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARow_proc}  讀取{searchword}第 1 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                )
                timeSleepFour()
                timeSleepRandomly()
                soup = ""
            # else:
            #     print(f"讀取{searchword}第 1 頁>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>成功!")

        try:
            totalPage = interDiv(searchNums(soup.select_one('.totalTxt').text),
                                 30)
        except AttributeError as e:
            print("getPageInARow 出錯", e)
            # force the program to stop
            raise

        print('------接下來要處理 ' + searchword + ' 的頁數---------', totalPage, '頁')
        print()

        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/1_{totalPage}_{searchword}.txt",
                'w',
                encoding='utf-8') as f:
            f.write(str(soup))
        print()
        print(f'成功寫出  {searchword}  第 1 頁')

        i_browser = 1
        try:
            browser.quit()
            print(
                f"成功關閉 browser{getPageInARow_proc}++++++++++++++++++++++++++++++"
            )
        except:
            print(
                f"放棄 {thisPID}__{getPageInARow_proc} 的 第{i_browser}個browser。")
            i_browser += 1
            print(
                f"kill {thisPID}__{getPageInARow_proc} >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
            )
            os.kill(thisPID, signal.SIGKILL)

        # Rest longer, so that the first page of every searchword gets fetched.
        timeSleepEight()
        timeSleepEight()

        for num in range(2, totalPage + 1):
            strNum = str(num)
            consecutiveData = searchword + "+" + strNum + "+" + str(
                totalPage) + "+" + re.sub(r"curPage=1", f"curPage={strNum}",
                                          url)
            output.put(consecutiveData)
            # print(f'這裡是getPageInARow,準備送給  getPageInARowAdvanced  處理:  {searchword} 的 第 {strNum} 頁,總共{totalPage}')
            print()
        input.task_done()  # notify the main process that this input item has been processed
        end = timeCalculate()
        print(f'{thisPID}__getPageInARow 累計耗時:{end-begin} 秒')
Example #10
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
                try:
                    newsContent = [
                        textMiningRegex.discardSpace(
                            textMiningRegex.replaceEscapeAlphabet(row.text))
                        for row in soup.select_one(".ndArticle_margin").select(
                            "p") if row.text != ""
                    ]
                    videoLinkInContent = None  # no video embedded in the article body itself
                except AttributeError as e:  # AttributeError: 'NoneType' object has no attribute 'select'
                    soupStr = str(soup)
                    if "<br> \xa0</p>" in soupStr:
                        # "<br> \xa0</p>"  不需要變成 "<br> \\xa0</p>"
                        """
                        sqlalchemy.exc.OperationalError: (pymssql.OperationalError) (8152, b'String or binary data would be truncated.DB-Lib error message 8152, severity 16:\nGeneral SQL Server error: Check messages from the SQL Server\n')
                        [SQL: INSERT INTO selected_news_with_tfidf ([news_title_Id], [series_Id], [publisher_Id], news_content, video_link_in_content) VALUES (%(news_title_Id)s, %(series_Id)s, %(publisher_Id)s, %(news_content)s, %(video_link_in_content)s)]
                        [parameters: {'news_title_Id': '201912252', 'series_Id': UUID('9abd7eae-c361-496c-b10c-ae9fcf7be8bb'), 'publisher_Id': '5', 'news_content': '[\'<p> 今年農曆年節時間較早,家電採購需求較以往提早出現買氣,瞄準年前有汰換家中家電的需求,大同3C福利品特賣會特於12月底開跑,一路至明年1月初,提供消費者年前採購好選擇。<br> <br> 12月26日起至2020年1月8日止,全台各地共舉辦20場大同3C福利品特賣會,大小家電可在此一次 ... 
                        (3925 characters truncated) ... aws.com/ap-ne-1-prod/public/FLCZDN5FBRQBN6E6E3S7RP7IW4.jpg","version":"0.10.3","width":640},{"_id":"IO25XHAIRJE3FCUWV7YTXI66CY","type":"raw_html",\']', 'video_link_in_content': None}]
                        (Background on this error at: http://sqlalche.me/e/e3q8)
                        """

                        # https://tw.appledaily.com/property/20191226/WCUY7RP45D2V45RLRN3RULU2QU/
                        tmpStr = soupStr.split(
                            """<script type="application/javascript">window.Fusion="""
                        )[1].split("Fusion.globalContent=")[1].split(
                            '"content":"')[1].split("<br> \xa0</p>")[0]
                        newsContent = [
                            row for row in BeautifulSoup(
                                tmpStr, "html.parser").text.split(" ")
                            if row != ""
                        ]
                    else:
                        # https://tw.appledaily.com/gadget/20190927/IFU7ML7HXNAL2GHDNKOZULDNOU/
                        tmpStr = soupStr.split(
                            """<script type="application/javascript">window.Fusion="""
                        )[1].split("Fusion.globalContent=")[1].split(
                            '"content":"')[1].split("更多「")[0]
                        newsContent = [
                            row for row in tmpStr.split("<br />&nbsp;<br />")
                            if row != ""
                        ]

                        if len("".join(newsContent)) >= 3500:
                            # elif '<br />&nbsp;"' in soupStr:
                            # https://tw.appledaily.com/gadget/20191029/KSU3NPGRYURXTCI3COIUE6KMNM/
                            print(
                                f"appledaily news content exceeds 3500: {url}")
                            tmpStr = soupStr.split(
                                """<script type="application/javascript">window.Fusion="""
                            )[1].split("Fusion.globalContent=")[1].split(
                                '"content":"')[1].split('<br />&nbsp;"}')[0]
                            newsContent = [
                                row
                                for row in tmpStr.split("<br />&nbsp;<br />")
                                if row != ""
                            ]

                    videoLinkInContent = None  # no video embedded in the article body itself

                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
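The fallback branch above digs the article text out of the embedded `window.Fusion` script with chained `str.split` calls. A hedged alternative is to locate the `Fusion.globalContent=` assignment and let the json module parse exactly one object from that point; this assumes the assigned payload really is valid JSON, which the page layout may or may not guarantee:

# Hedged alternative to the chained split() calls: parse the Fusion payload as
# JSON. Assumes the text after "Fusion.globalContent=" starts a valid JSON object.
import json


def fusion_global_content(page_html):
    marker = "Fusion.globalContent="
    start = page_html.index(marker) + len(marker)
    # raw_decode parses exactly one JSON value and ignores whatever follows it.
    obj, _ = json.JSONDecoder().raw_decode(page_html, start)
    return obj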
Example #11
    def requests(cls, url, headers):
        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()

                soup = BeautifulSoup(res.text, 'html.parser')
                newsContent = [
                    textMiningRegex.discardSpace(
                        textMiningRegex.replaceEscapeAlphabet(row.text)) for
                    row in soup.select_one("#story_body_content").select("p")
                    if row.text != ""
                ]
                videoLinkInContent = None  # no video embedded in the article body itself
                break
            except AttributeError as e:

                try:
                    # 2020-02-07: udn site redesign
                    newsContent = [
                        textMiningRegex.discardSpace(
                            textMiningRegex.replaceEscapeAlphabet(row.text))
                        for row in soup.find("article", {
                            "class": "article-content"
                        }).find_all("p") if row.text != ""
                    ]
                except AttributeError as e:
                    # When the visited page is a 404, the html looks like the following.
                    '''
                    response404 = """<html>
                                <head>
                                <script>
                                                        var d = new Date();
                                                        d.setTime(d.getTime() + (300*1000));
                                                        var expires = "expires="+ d.toUTCString();
                                                        document.cookie = "burl=my-test-page01;" + expires + ";path=/";
                                                </script>
                                <!-- Google Tag Manager -->
                                <script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
                                                new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
                                                j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
                                                'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
                                                })(window,document,'script','dataLayer','GTM-5CMHR66');</script>
                                <!-- End Google Tag Manager --><script>
                                                (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
                                                (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
                                                m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
                                                })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
                                                        </script>
                                <!-- #Location: /inc/meta/trace_ga -->
                                </head>
                                <body>
                                <!-- Google Tag Manager (noscript) -->
                                <noscript><iframe height="0" src="https://www.googletagmanager.com/ns.html?id=GTM-5CMHR66" style="display:none;visibility:hidden" width="0"></iframe></noscript>
                                <!-- End Google Tag Manager (noscript) -->
                                <script>
                                                window.location="/news/e404?nver";
                                        </script>
                                </body>
                                </html>"""
                    
                    '''

                    if searchWordTrueOrFalse(
                            "404",
                            str(soup.select_one("body").select_one("script"))
                    ):  #'<script>\n                window.location="/news/e404?nver";\n        </script>'
                        # https://udn.com/news/story/7238/3600804
                        print(url, "發生問題:404!")
                        newsContent = "404_None"
                    else:
                        # inspect this unidentified case
                        print(soup)
                        newsContent = "404_None"
                        raise

                videoLinkInContent = None  # no video embedded in the article body itself
                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                newsContent = None
                videoLinkInContent = None

        return videoLinkInContent, newsContent
def getPageInARow(input, headers, objectiveFolder, objective, *args):
    thisPID = os.getpid()
    while True:
        # print(thisPID,"===========================================")
        consecutiveUrl = input.get()
        searchword, correctUrl, txtFileRoute = consecutiveUrl.split("+")

        fileName = txtFileRoute.split("/")[-1]
        page = fileName.split("_")[0]
        totalPage = fileName.split("_")[1]

        # print('getPageInARow is in new process %s, %s ' % (getPageInARow_proc, os.getpid()))
        # print('------接下來要處理 ' + searchword + '第' ,page, '頁---------共', totalPage, '頁')

        for i in range(4):
            if i <= 2:
                try:
                    timeSleepRandomly()
                    res = requests.get(correctUrl, headers=headers)
                    res.encoding = 'utf-8'
                    timeSleepRandomly()
                    timeSleepOne()
                    soup = BeautifulSoup(res.text, 'html.parser')
                    break
                except requests.exceptions.ConnectionError as e:
                    print(fileName, "發生問題。", i, e)
                    print()
                    timeSleepRandomly()
                    timeSleepTwo()
                    soup = ""
            else:
                try:
                    timeSleepEight()
                    timeSleepRandomly()
                    res = requests.get(correctUrl, headers=headers)
                    res.encoding = 'utf-8'
                    timeSleepRandomly()
                    soup = BeautifulSoup(res.text, 'html.parser')
                    break
                except requests.exceptions.ConnectionError as e:
                    print(fileName, "發生問題。", i, e)
                    print()
                    soup = ""

        # If the second failure case was triggered, force soup to an empty string.
        if judgeSoup(soup, searchword, correctUrl, txtFileRoute) == "check":
            soup = ""

        # original version:
        # timeSleepOne()
        # timeSleepRandomly()
        # res = requests.get(correctUrl, headers=headers)
        # res.encoding = 'utf-8'
        # timeSleepRandomly()
        # soup  = BeautifulSoup(res.text,'html.parser')

        with open(txtFileRoute, 'w', encoding='utf-8') as f:
            f.write(str(soup))
        # print(f"成功寫出  {searchword}  第 {page} 頁, 共 {totalPage} 頁。")
        end = timeCalculate()
        # print('getPageInARow 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()  # notify the main process that this input item has been processed
def detailPageInARow(input,  headers, objectiveFolder, objective, *args):
    """
    As many as 28,000 detail urls we are supposed to crawl would inevitalby leave some processes to fail to get the correct responses.
    As such, we should extend more time while crawling , or establish exception handler in porgrams.
    
    """
    # begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        # print(thisPID,"===========================================")
        
        consecutiveUrl = input.get()
        searchword, url, txtFileRoute = consecutiveUrl.split("+")
        
        # print('detailPageInARow is in new process %s, %s ' % (detailPageInARow_proc, thisPID))
        # print()

        for i in range(4):
            if i <= 2:
                try:
                    timeSleepTwo()
                    res = requests.get(url, headers=headers)
                    res.encoding = 'utf-8'
                    timeSleepRandomly()
                    soup = BeautifulSoup(res.text, 'html.parser')
                    break
                except requests.exceptions.ConnectionError as e:
                    print(url, "發生問題。", e)
                    print()
                    timeSleepRandomly()
                    timeSleepTwo()
                    timeSleepTwo()
                    soup = ""
            else:
                try:
                    timeSleepEight()
                    res = requests.get(url, headers=headers)
                    res.encoding = 'utf-8'
                    timeSleepRandomly()
                    soup = BeautifulSoup(res.text, 'html.parser')
                    break
                except requests.exceptions.ConnectionError as e:
                    print(txtFileRoute, "發生問題。", e)
                    print()
                    soup = ""

        # If the second failure case was triggered, force soup to an empty string.
        if judgeSoup(soup, searchword, url, txtFileRoute) == "check":
            soup = ""

        with open(txtFileRoute, 'w', encoding='utf-8') as f:
            f.write(str(soup))
        
        fileName = txtFileRoute.split("/")[-1]
        productIndex = fileName.split("_")[0]
        productNums = fileName.split("_")[1]
        print(f"{thisPID}__成功寫出  {searchword}  detail頁, 第 {productIndex} 項, 共 {productNums} 項。")
            
        timeSleepRandomly()

        # print('這裡是 detailPageInARow 完成: ' + fileName + " 的爬取。")
        end = timeCalculate()
        # print('detailPageInARow 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()
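The docstring suggests either extending the crawl time or adding exception handlers. One way to extend the time is an exponential backoff between retries; a sketch, not the project's actual helper:

# Sketch of a backoff-based retry, per the docstring's suggestion to extend
# crawl time; this is not the project's actual helper.
import time

import requests
from bs4 import BeautifulSoup


def fetch_with_backoff(url, headers, attempts=4, base_delay=2):
    for i in range(attempts):
        try:
            res = requests.get(url, headers=headers)
            res.encoding = 'utf-8'
            return BeautifulSoup(res.text, 'html.parser')
        except requests.exceptions.ConnectionError as e:
            print(url, "connection error on attempt", i + 1, ":", e)
            time.sleep(base_delay * 2 ** i)  # 2s, 4s, 8s, ...
    return ""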
def detailPageInARow(input, headers, objectiveFolder, objective, *args):
    """
    As many as 28,000 detail urls we are supposed to crawl would inevitalby leave some processes to fail to get the correct responses.
    As such, we should extend more time while crawling , or establish exception handler in porgrams.
    
    """
    # begin = timeCalculate()
    thisPID = os.getpid()
    while True:
        # print(thisPID,"===========================================")

        consecutiveUrl = input.get()
        searchword, url, txtFileRoute = consecutiveUrl.split("+")

        # print('detailPageInARow is in new process %s, %s ' % (detailPageInARow_proc, thisPID))
        # print()

        for i in range(3):
            try:
                timeSleepOne()
                res = requests.get(url, headers=headers)
                res.encoding = 'utf-8'
                timeSleepRandomly()
                soup = BeautifulSoup(res.text, 'html.parser')
                break
            except requests.exceptions.ConnectionError as e:
                print(url, "發生問題。", e)
                print()
                timeSleepRandomly()
                timeSleepTwo()
                soup = ""

        judgeSoup(soup, searchword, url, txtFileRoute)
        # if not soup:
        #   badRequestRoute = "/".join(txtFileRoute.split("/")[:-3]) + "/badRequest"
        #   with open(f"{badRequestRoute}/badRequest_{searchword}.txt", "a",  newline='', encoding='utf-8')as f: # newline沒作用...
        #       errorMessage = url + "\n"
        #       f.write(errorMessage)   #writelines作用在errorMessage是list時
        # elif soup.select_one('head').text.strip() == 'Service Unavailable':
        #   """

        #   「
        #   <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN""http://www.w3.org/TR/html4/strict.dtd">

        #   <html><head><title>Service Unavailable</title>
        #   <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/></head>
        #   <body><h2>Service Unavailable</h2>
        #   <hr/><p>HTTP Error 503. The service is unavailable.</p>
        #   </body></html>
        #   」

        #   """
        #   soup = ""

        with open(txtFileRoute, 'w', encoding='utf-8') as f:
            f.write(str(soup))

        fileName = txtFileRoute.split("/")[-1]
        productIndex = fileName.split("_")[0]
        productNums = fileName.split("_")[1]
        print(
            f"{thisPID}__成功寫出  {searchword}  detail頁, 第 {productIndex} 項, 共 {productNums} 項。"
        )

        timeSleepRandomly()

        # print('這裡是 detailPageInARow 完成: ' + fileName + " 的爬取。")
        end = timeCalculate()
        # print('detailPageInARow 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()
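Both detailPageInARow variants consume items with input.get() and acknowledge them with input.task_done(), which matches multiprocessing.JoinableQueue semantics. A sketch of how a main process might wire such workers; the queue contents, paths, and worker count here are illustrative only:

# Illustrative wiring only: the queue item format ("searchword+url+txtFileRoute")
# matches what the worker splits on "+", but the paths and counts are made up.
from multiprocessing import JoinableQueue, Process

if __name__ == "__main__":
    tasks = JoinableQueue()
    headers = {"User-Agent": "Mozilla/5.0"}

    tasks.put("phone+https://example.com/item/1+/tmp/1_10_phone.txt")

    for _ in range(4):
        worker = Process(target=detailPageInARow,
                         args=(tasks, headers, "rawData", "detail"),
                         daemon=True)
        worker.start()

    tasks.join()  # returns once every queued item has been task_done()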
def getPageInARow(input, output, folderWorker, momoMallBrowser):
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()
        print('getPageInARow is in new process %s, %s ' %
              (getPageInARow_proc, thisPID))
        folderWorker.eraseRawData(searchword)
        folderWorker.mkdirForRawData(searchword)

        url = momoMallBrowser.keywordResourcePair._momoMallKeywordUrlPair[
            searchword]

        # Putting the browser-creation code inside the while True loop avoids the situation where the same browser instance keeps visiting pages and gets rejected.
        for i in range(4):
            try:
                timeSleepOne()
                timeSleepRandomly()

                browser = momoMallBrowser.intersectionForCrawl(
                    folderWorker.objective)

                timeSleepRandomly()

                browser.visit(url)

                browserWaitTime(browser)
                timeSleepTwo()

                # Click the "accuracy" sort option; the page count jumps back to page 1.
                try:
                    buyingTendency = momoMallBrowser.browserClickSearchType(
                        browser, 1)
                    browserWaitTime(browser)
                    timeSleepTwo()
                except AttributeError as e:
                    print(
                        f"{thisPID}__{getPageInARow_proc}  {searchword} 第1頁 點擊準確度有問題。",
                        e)
                    print(
                        f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                    )
                    browserQuit(browser, thisPID, getPageInARow_proc)
                    timeSleepFour()
                    soup = ""
                    continue

                tempHtml = browser.html

                timeSleepRandomly()
                soup = BeautifulSoup(tempHtml, 'lxml')
                print(
                    f"-----------------讀取{searchword}_{buyingTendency}第 1 頁-----------------成功!"
                )

                try:
                    ## current page and total page '頁數5/286'

                    pageState = browser.find_by_xpath(
                        '//*[@id="bt_2_layout_Content"]/div[2]/dl/dt/span')
                    totalPage = int(pageState.text.split('/')[1])
                    currentPage = int(
                        numsHandler.searchFloatNums(
                            pageState.text.split('/')[0]))
                    print(
                        f"-----------------讀取{searchword}_{buyingTendency} 總頁數-----------------成功!"
                    )
                except AttributeError as e:
                    print(f"getPageInARow __{searchword}__出錯", e, "重抓一次!")
                    # Forcing the program to stop: in practice, raise only stops the current process,
                    # not the whole set of processes, so raise is not suitable here.
                    # raise
                    currentPage = 1  # manual fallback
                    totalPage = 3  # manual fallback
                    continue
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARow_proc}  讀取{searchword}第 1 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                )
                browserQuit(browser, thisPID, getPageInARow_proc)
                timeSleepFour()
                timeSleepRandomly()
                soup = ""
            except StaleElementReferenceException as e:
                print(
                    "----------------StaleElementReferenceException----------------"
                )
                print(
                    f"{thisPID}__{getPageInARow_proc}  讀取{searchword}第 1 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARow_proc}  重建browser物件,進行再處理 {i} 次!"
                )
                browserQuit(browser, thisPID, getPageInARow_proc)
                timeSleepFour()
                timeSleepRandomly()
                soup = ""

        if not soup:
            errorMessage = f"{url}__{currentPage}__" + "\n"
            folderWorker.writeOutFile(
                f"{folderWorker._BASE_PATH}/dataMunging/{folderWorker.objectiveFolder}/{folderWorker.objective}/badRequest",
                f"badRequest_{searchword}.txt",
                errorMessage,
                writeOutType="a")

        folderWorker.writeOutFile(
            f"{folderWorker._BASE_PATH}/dataMunging/{folderWorker.objectiveFolder}/{folderWorker.objective}/{searchword}",
            f"1_{totalPage}_{searchword}.txt", soup)

        print(f'成功寫出  {searchword}  第 {currentPage} 頁')

        print('------接下來要處理 ' + searchword + ' 的頁數---------', totalPage, '頁')

        browserQuit(browser, thisPID, getPageInARow_proc)

        # Rest longer, so that the first page of every searchword gets fetched.
        timeSleepEight()
        timeSleepEight()
        timeSleepEight()

        for num in range(2, totalPage + 1):
            strNum = str(num)
            consecutiveData = searchword + "+" + strNum + "+" + str(totalPage)
            output.put(consecutiveData)
            # print(f'這裡是getPageInARow,準備送給  getPageInARowAdvanced  處理:  {searchword} 的 第 {strNum} 頁,總共{totalPage}')
            # print()

        input.task_done()  # notify the main process that this input item has been processed
        end = timeCalculate()
        print(f'{thisPID}__getPageInARow 累計耗時:{end-begin} 秒')
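The page indicator is read from text like '頁數5/286' via the project's numsHandler helper. A minimal regex-based sketch of the same parsing, assuming the text always contains a current/total pair:

# Regex-based sketch of parsing the '頁數5/286' indicator; numsHandler is the
# project's helper and is not reproduced here.
import re


def parse_page_state(text):
    current, total = re.search(r"(\d+)\s*/\s*(\d+)", text).groups()
    return int(current), int(total)


print(parse_page_state("頁數5/286"))  # -> (5, 286)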
def getPageInARowAdvanced(input, folderWorker, momoMallBrowser):
    """
    開始對POST網址進行splinter的點擊
    """
    thisPID = os.getpid()
    while True:
        # print(thisPID,"===========================================")
        consecutiveData = input.get()
        searchword, currentPage, totalPage = consecutiveData.split('+')

        url = momoMallBrowser.keywordResourcePair._momoMallKeywordUrlPair[
            searchword]

        # Putting the browser-creation code inside the while True loop avoids the situation where the same browser instance keeps visiting pages and gets rejected.
        for i in range(4):
            try:
                timeSleepFour()

                browser = momoMallBrowser.intersectionForCrawl(
                    folderWorker.objective)

                timeSleepRandomly()

                browserSetWindowSize(browser, horizon=1920, vertical=1080)
                timeSleepOne()

                browser.visit(url)

                browserWaitTime(browser)
                timeSleepTwo()

                # Click the "accuracy" sort option; the page count jumps back to page 1.
                try:
                    buyingTendency = momoMallBrowser.browserClickSearchType(
                        browser, 1)
                    browserWaitTime(browser)
                    timeSleepTwo()
                except AttributeError as e:
                    print(
                        f"{thisPID}__{getPageInARowAdvanced_proc}  {searchword} 在第{currentPage}頁點擊準確度有問題。",
                        e)
                    print(
                        f"{thisPID}__{getPageInARowAdvanced_proc}  重建browser物件,進行再處理 {i} 次!"
                    )
                    browserQuit(browser, thisPID, getPageInARowAdvanced_proc)
                    timeSleepFour()
                    soup = ""
                    continue

                # Click through to the correct page number.
                momoMallBrowser.browserClickPageNumber(browser, currentPage,
                                                       totalPage, searchword)

                tempHtml = browser.html
                timeSleepRandomly()

                # simulate human behaviour
                momoMallBrowser.humanSimulate(browser)

                soup = BeautifulSoup(tempHtml, 'lxml')
                # print(f"讀取{searchword}第 {currentPage} 頁,成功!")
                break
            except (ConnectionRefusedError, TimeoutException,
                    WebDriverException) as e:
                print(
                    f"{thisPID}__{getPageInARowAdvanced_proc} 讀取 {searchword} 第 {currentPage} 頁有問題。",
                    e)
                print(
                    f"{thisPID}__{getPageInARowAdvanced_proc} 重建browser物件,進行再處理 {i} 次!"
                )
                browserQuit(browser, thisPID, getPageInARowAdvanced_proc)
                timeSleepFour()
                timeSleepRandomly()
                soup = ""
            # else:
            #     print(f"讀取{searchword}第 {page} 頁,成功!")

        if not soup:
            errorMessage = f"{url}__{currentPage}__" + "\n"
            folderWorker.writeOutFile(
                f"{folderWorker._BASE_PATH}/dataMunging/{folderWorker.objectiveFolder}/{folderWorker.objective}/badRequest",
                f"badRequest_{searchword}.txt",
                errorMessage,
                writeOutType="a")

        folderWorker.writeOutFile(
            f"{folderWorker._BASE_PATH}/dataMunging/{folderWorker.objectiveFolder}/{folderWorker.objective}/{searchword}",
            f"{currentPage}_{totalPage}_{searchword}.txt", soup)

        # print(f'{thisPID}  成功寫出  {searchword}  第{currentPage}頁,總共{totalPage} 頁。')

        browserQuit(browser, thisPID, getPageInARowAdvanced_proc)

        input.task_done()  # notify the main process that this input item has been processed
        end = timeCalculate()