Code example #1
def main(url, checkRange=30):
    header.processBegin(url=url)
    header.clearFolder()  #[2019.02.11]

    try:
        soup = request2soup(url)

        df_1 = parsingTitle(soup, checkRange)
        if len(df_1) != 0:
            header.outputCsv(df_1, "第一層結果")
            df_2 = parsingDetail(df_1)
            header.outputCsv(df_2, "第二層結果")
            header.RESULT_COUNT = len(df_2)
        header.zipFile()
        header.createInfoFile()
        header.createOKFile()
        header.outputLastResult(df_1, header.lastResult,
                                checkRange)  # [2019.02.11] added lastResult output
    except Exception:
        logging.error("執行爬網作業失敗")  # crawl job failed
        traceback.print_exc()
        header.createInfoFile()

    header.processEnd()
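
All of these main() variants drive a project-specific header helper (processBegin, outputCsv, zipFile, createInfoFile, createOKFile, processEnd, ...) that is not shown in the snippets. A minimal sketch of a stand-in, assuming only the names used in example #1 and making no claim about the real s8888/WebCrawler module, lets the flow above be exercised in isolation:

import logging

logging.basicConfig(level=logging.INFO)

class HeaderStub:
    """Hypothetical stand-in for the project's header module; it only mirrors the calls visible above."""
    RESULT_COUNT = 0
    EXIT_CODE = 0
    lastResult = None

    def processBegin(self, url=None): logging.info("crawl start: %s", url)
    def clearFolder(self): pass
    def outputCsv(self, df, name, path=None): logging.info("wrote %s (%d rows)", name, len(df))
    def zipFile(self, **kwargs): pass
    def createInfoFile(self): pass
    def createOKFile(self): pass
    def outputLastResult(self, df, lastResult, checkRange): pass
    def processEnd(self): logging.info("crawl end")

header = HeaderStub()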
Code example #2
def main(url, tabNumber, checkRange=15):

    header.processBegin()
    header.clearFolder()
    DownloadTool = SeleniumUtil.ChromeDownload()
    DownloadTool.setDownLoadTempPath(header.TEMP_PATH)
    DownloadTool.setDownLoadFinalPath(FinalPath)
    chrome_options = DownloadTool.getChromeOptions()
    driver = webdriver.Chrome(
        chrome_options=chrome_options)  # open Chrome with the prepared options (Selenium 3 keyword; see the note after this example)
    try:
        if isinstance(tabNumber, int) and 19 <= tabNumber <= 22:
            url = url + str(tabNumber)
        else:
            raise ValueError("tabNumber 必須為 19 到 22 的整數")  # tabNumber must be an integer from 19 to 22

        driver.get(url)
        df_1 = parsingTitle(url, driver, checkRange)
        if len(df_1) != 0:
            header.outputCsv(df_1, "第一層結果", FinalPath)

            df_2 = parsingDetail(df_1, tabNumber, FinalPath)
            header.outputCsv(df_2, "第二層結果", FinalPath)
            header.RESULT_COUNT = len(df_1)
            header.zipFile()
            header.createInfoFile()
            header.createOKFile()
            header.outputLastResult(df_1, header.lastResult,
                                    checkRange)  # 2019-02-01 added lastResult output
    except Exception:
        logging.error("執行爬網作業失敗")
        header.EXIT_CODE = -1
        traceback.print_exc()

    header.processEnd()  # note: driver.quit() is never called in this variant
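
The chrome_options= keyword used above was removed in Selenium 4; on current Selenium versions the driver is built with options= instead. A minimal sketch, assuming Selenium 4+ and a locally installed Chrome (the download-path options from SeleniumUtil.ChromeDownload are omitted here):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless=new")  # optional: run without a visible window
driver = webdriver.Chrome(options=chrome_options)  # Selenium 4 signature
driver.get("https://example.com")
driver.quit()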
Code example #3
File: JAXRDB011.py  Project: s8888/WebCrawler
def main(url, checkRange=30):

    header.processBegin(url=url)
    header.clearFolder()

    try:
        soup = request2soup(url, 1)

        df_1 = parsingTitle(soup, checkRange)
        if len(df_1) != 0:
            header.outputCsv(df_1, "第一層結果")

            df_2 = parsingDetail(df_1)
            header.outputCsv(df_2, "Result")
            header.RESULT_COUNT = len(df_2)

        header.zipFile()
        header.createInfoFile()
        header.createOKFile()
        
    except Exception:
        print("執行爬網作業失敗")
        logging.error("執行爬網作業失敗")
        header.EXIT_CODE = -1
        traceback.print_exc()
        header.createInfoFile()
    
    header.processEnd()
Code example #4
def parsingTitle(soup, checkRange):
    # initialise the result frame up front so the final return cannot raise NameError on an early failure
    df = pd.DataFrame(
        columns=["WEB_ADDR", "CRL_DATE", "ISS_DATE", "TITL", "LNK_URL"])
    try:
        # load the result of the previous crawl
        lastResultPath = header.LAST_RESULT_PATH  # [2019.02.11] previously appended "/lastResult.csv"

        if os.path.isfile(lastResultPath):
            lastResult = pd.read_csv(lastResultPath)
        else:
            lastResult = pd.DataFrame()
        header.lastResult = lastResult  # [2019.02.11] new global variable

        # crawl window: the last checkRange days
        endDate = datetime.date.today()
        strDate = (endDate - datetime.timedelta(days=checkRange)).isoformat()

        soup = request2soup(url)  # url is a module-level constant in the original script; this overrides the soup parameter

        # data processing
        result = dataProcess_Title(soup, strDate)

        d = {
            'WEB_ADDR': url,
            'CRL_DATE': result['crawl_date'],
            'ISS_DATE': '',
            'TITL': result['titles_result'],
            'LNK_URL': result['links']
        }

        df = df.append(pd.DataFrame(data=d))  # DataFrame.append was removed in pandas 2.0; use pd.concat on newer versions
        # skip rows whose title already appeared in the previous crawl result
        if not lastResult.empty:
            for index, row in df.iterrows():
                if row['TITL'] in list(lastResult['TITL']):
                    df.drop(index, inplace=True)

        if len(df) == 0:
            logging.critical("%s 至 %s 間無資料更新" % (strDate, endDate))
        else:
            df.index = [i for i in range(df.shape[0])]  # reset
            lastResult = lastResult.append(df)
            lastResult.index = [i for i in range(lastResult.shape[0])]  # reset
            lastResult = lastResult[pd.to_datetime(lastResult['CRL_DATE']) >= (
                datetime.date.today() - datetime.timedelta(days=checkRange))]
            header.outputCsv(lastResult, "lastResult", header.CRAWL_LIST_PATH)

    except Exception:
        header.EXIT_CODE = -1
        logging.error("爬取主旨列表失敗")
        traceback.print_exc()
    return df
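
The result frame here (and in example #6 below) is grown with DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0. A small sketch of the same accumulation with pd.concat, using made-up sample values for the columns defined above:

import pandas as pd

pages = []  # collect one DataFrame per page, then concatenate once
pages.append(pd.DataFrame({
    "WEB_ADDR": ["https://example.gov.tw/news"],   # placeholder values, not from the project
    "CRL_DATE": ["2019-02-11"],
    "ISS_DATE": ["2019-02-10"],
    "TITL": ["sample announcement"],
    "LNK_URL": ["https://example.gov.tw/news/1"],
}))
df = pd.concat(pages, ignore_index=True)  # replaces repeated df.append(...) and the manual index reset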
Code example #5
def main(url, checkRange=30):
    header.processBegin()
    header.clearFolder()
    try:
        df_1 = parsingTitle(url, checkRange)
        if len(df_1) == 0:
            return  # note: unlike the other variants, this early return skips zipFile/createInfoFile/processEnd
        header.outputCsv(df_1, "第一層結果", FinalPath)
        df_2 = parsingDetail(df_1, FinalPath)
        header.outputCsv(df_2, "第二層結果", FinalPath)
        header.RESULT_COUNT = len(df_1)
        header.zipFile()
        header.createInfoFile()
        header.createOKFile()
        header.outputLastResult(df_1, header.lastResult, checkRange)   # 2019-02-01 added lastResult output
    except Exception:
        logging.error("執行爬網作業失敗")
        header.EXIT_CODE = -1
        traceback.print_exc()
    
    header.processEnd()
Code example #6
def parsingTitle(soup, checkRange):
    try:
        # load the result of the previous crawl
        lastResultPath = header.LAST_RESULT_PATH + "/lastResult.csv"
        if os.path.isfile(lastResultPath):
            lastResult = pd.read_csv(lastResultPath)
        else:
            lastResult = pd.DataFrame()

        # crawl window: the last checkRange days
        endDate = datetime.date.today()
        strDate = (endDate - datetime.timedelta(days=checkRange)).isoformat()

        totalPage = soup.select(".page")[0].text.split("/")[1]  # total page count
        ending = False

        df = pd.DataFrame(
            columns=["WEB_ADDR", "CRL_DATE", "ISS_DATE", "TITL", "LNK_URL"])

        for i in range(int(totalPage)):
            if i != 0:
                soup = request2soup(url, i + 1)  # url is a module-level constant in the original script

            try:
                sorts = soup.select(".sort1")
                sorts = [x.text.strip() for x in sorts]

                dates = soup.select(".pdate1")
                dates = [x.text.strip() for x in dates]

                titles = soup.select(".ptitle1")
                titles = [x.text.strip() for x in titles]

                links = soup.select(".ptitle1 a")
                links = [
                    "https://www.ib.gov.tw/ch/" + x.get("href") for x in links
                ]

                idx = pd.Series([False] * len(dates))
                for j in range(len(dates)):
                    date = dates[j]
                    if date < strDate:  # if the issue date is earlier than the window start, stop collecting titles
                        ending = True
                        break
                    idx[j] = True
                d = {
                    "WEB_ADDR": url,
                    "CRL_DATE": endDate,
                    "ISS_DATE": dates,
                    "TITL": titles,
                    "LNK_URL": links
                }

                df = df.append(pd.DataFrame(data=d)[idx])  # append page

                # once the cutoff date is reached, stop fetching the remaining pages
                if ending:
                    break
            except Exception:
                logging.error("爬取第 %s 頁主旨發生錯誤" % str(i + 1))
                traceback.print_exc()

        df.index = [i for i in range(df.shape[0])]  # reset Index
        header.outputCsv(df, "lastResult", header.LAST_RESULT_PATH)

        if not lastResult.empty:
            # skip rows whose issue date and title match the previous crawl result
            for i in range(len(df)):
                for j in range(len(lastResult)):
                    if (df.ISS_DATE[i] == lastResult.ISS_DATE[j]) & (
                            df.TITL[i] == lastResult.TITL[j]):
                        df.drop(i, inplace=True)
                        break

        if len(df) == 0:
            logging.critical("%s 至 %s 間無資料更新" % (strDate, endDate))
        else:
            df.index = [i for i in range(df.shape[0])]  # reset

        return df

    except Exception:
        logging.error("爬取主旨列表失敗")
        traceback.print_exc()
        return pd.DataFrame(
            columns=["WEB_ADDR", "CRL_DATE", "ISS_DATE", "TITL", "LNK_URL"])
Code example #7
File: EP_WC0100.py  Project: s8888/WebCrawler
def main():
    header.processBegin(url=WEB_URL)
    header.clearFolder()

    try:
        reqMap = {
            # [20190402] counties and cities added to the query
            "REGION_IDs": {
                "1": "台北市",
                "2": "基隆市",
                "3": "新北市",
                "4": "新竹市",
                "5": "新竹縣",
                "6": "桃園市",
                "7": "苗栗縣",
                "8": "台中市",
                "10": "彰化縣",
                "11": "南投縣",
                "12": "嘉義市",
                "13": "嘉義縣",
                "14": "雲林縣",
                "15": "台南市",
                "17": "高雄市",
                "19": "屏東縣",
                "21": "宜蘭縣",
                "22": "台東縣",
                "23": "花蓮縣",
                "24": "澎湖縣",
                "25": "金門縣",
                "26": "連江縣"
            },
            "QUERY_TYPEs": {
                '1': "住宅用地",
                '2': "商業用地",
                '3': "工業用地"
            },
            "CONDITIONS": {
                "is_new_list": '1',
                "type": '2',
                "searchtype": '1',
                "firstRow": '0',
                "kind": "11",  # 廠房土地出售
                "area": "300,"  # 最小 300 坪
            }
        }

        standbyDataFrame, historyDataFrame = parsingTitle(reqMap)

        if len(standbyDataFrame) < 1:
            # no new data
            logMsg = "無資料更新,爬網日期:" + TODAY
            print(logMsg)
            logging.critical(logMsg)
        else:
            finishDataFrame, detailDataFrame = parsingDetail(standbyDataFrame)

            header.outputCsv(detailDataFrame, header.PROJECT)
            header.RESULT_COUNT = len(detailDataFrame)

            # update the crawlHistory file
            updateHistoryDataFrame = pd.concat(
                [historyDataFrame, finishDataFrame], ignore_index=True)
            header.outputCsv(updateHistoryDataFrame, "crawlHistory",
                             header.LAST_RESULT_PATH)

        header.zipFile()
        header.createInfoFile()
        header.createOKFile()

    except Exception:
        setErrorMessage("執行爬網作業失敗")
        header.createInfoFile()
        header.zipFile(zipFolder=header.LOG_PATH, zipResultWithLog=False)

    header.processEnd()
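
How parsingTitle consumes reqMap is not shown in this snippet; one plausible reading is that it expands every REGION_IDs × QUERY_TYPEs combination on top of the shared CONDITIONS. A hedged sketch of that expansion, with a trimmed reqMap and made-up query keys (regionid, purpose) used purely for illustration:

import itertools

reqMap = {
    "REGION_IDs": {"1": "台北市", "3": "新北市"},
    "QUERY_TYPEs": {"1": "住宅用地", "2": "商業用地"},
    "CONDITIONS": {"is_new_list": "1", "type": "2", "kind": "11", "area": "300,"},
}

for (regionId, regionName), (typeId, typeName) in itertools.product(
        reqMap["REGION_IDs"].items(), reqMap["QUERY_TYPEs"].items()):
    # hypothetical parameter names; the real request keys are not visible in this excerpt
    params = dict(reqMap["CONDITIONS"], regionid=regionId, purpose=typeId)
    print(regionName, typeName, params)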