Example #1
import datetime
import logging
import traceback

# `header`, `parsingTitle`, `parsingDetail`, `WEB_URL`, and `FINAL_PATH`
# are supplied by the surrounding project.
def main(checkRange=30):
    header.processBegin(url=WEB_URL)
    header.clearFolder()

    try:
        # Crawl window: from `checkRange` days ago up to today
        endDate = datetime.date.today()
        strDate = (endDate - datetime.timedelta(days=checkRange)).isoformat()
        df_1 = parsingTitle(strDate, endDate)

        # Check whether there is new data to crawl
        RESULT_COUNT = len(df_1)
        if RESULT_COUNT < 1:
            logging.critical("No new data between %s and %s" % (strDate, endDate))
        else:
            header.outputCsv(df_1, "第一層結果", FINAL_PATH)

            df_2 = parsingDetail(df_1)
            header.outputCsv(df_2, "第二層結果", FINAL_PATH)

            header.RESULT_COUNT = RESULT_COUNT

            # Update the crawlHistory file
            header.outputLastResult(df_1, header.lastResult, checkRange)

        header.zipFile()
        header.createInfoFile()
        header.createOKFile()
    except Exception:
        header.EXIT_CODE = -1
        logging.error("Crawl job failed")
        logging.error(traceback.format_exc())

    header.processEnd()
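
Every example on this page leans on the same project-local `header` helper module, which is not shown here. The following is a minimal sketch of the interface the calls imply; the stub bodies and the FINAL_PATH default are assumptions for illustration, not the project's real implementation:

import logging
import os
import shutil

import pandas as pd

EXIT_CODE = 0
RESULT_COUNT = 0
lastResult = pd.DataFrame()
FINAL_PATH = "./output"  # assumed location; the real project configures this

def processBegin(url=""):
    logging.info("crawl started: %s", url)

def clearFolder():
    # reset the output folder before a new run
    shutil.rmtree(FINAL_PATH, ignore_errors=True)
    os.makedirs(FINAL_PATH, exist_ok=True)

def outputCsv(df, fileName, path=FINAL_PATH):
    # write a DataFrame to <path>/<fileName>.csv
    df.to_csv(os.path.join(path, fileName + ".csv"), index=False, encoding="utf-8-sig")

def processEnd():
    logging.info("crawl finished, EXIT_CODE=%s, RESULT_COUNT=%s", EXIT_CODE, RESULT_COUNT)

The remaining helpers (zipFile, createInfoFile, createOKFile, outputLastResult) would package the results and record the crawl history in the same spirit.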
Example #2
import logging
import traceback

# `header`, `request2soup`, `parsingTitle`, and `parsingDetail` are
# supplied by the surrounding project.
def main(url, checkRange=30):
    header.processBegin(url=url)
    header.clearFolder()

    try:
        soup = request2soup(url, 1)

        df_1 = parsingTitle(soup, checkRange)
        if len(df_1) != 0:
            header.outputCsv(df_1, "第一層結果")

            df_2 = parsingDetail(df_1)
            header.outputCsv(df_2, "第二層結果")
            header.RESULT_COUNT = len(df_2)
        header.zipFile()
        header.createInfoFile()
        header.createOKFile()
        # [2019.02.11] added: write out the lastResult file
        header.outputLastResult(df_1, header.lastResult, checkRange)
    except Exception:
        logging.error("Crawl job failed")
        traceback.print_exc()
        header.createInfoFile()

    header.processEnd()
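
`request2soup` is likewise project-local. A plausible minimal version, assuming the requests and beautifulsoup4 packages; the second argument is presumably a page number sent as a query parameter, and the parameter name "page" is a guess:

import requests
from bs4 import BeautifulSoup

def request2soup(url, page=None):
    # fetch the page and parse the HTML into a BeautifulSoup tree
    params = {"page": page} if page is not None else None  # "page" is an assumed parameter name
    resp = requests.get(url, params=params, timeout=30)
    resp.raise_for_status()
    resp.encoding = resp.apparent_encoding  # guard against mis-declared charsets
    return BeautifulSoup(resp.text, "html.parser")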
Example #3
import datetime
import logging
import os
import traceback

import pandas as pd

# `header`, `request2soup`, and `dataProcess_Title` are supplied by the
# surrounding project.
def parsingTitle(url, checkRange):
    df = pd.DataFrame(
        columns=["WEB_ADDR", "CRL_DATE", "ISS_DATE", "TITL", "LNK_URL"])
    try:
        # Load the previous crawl result
        lastResultPath = header.LAST_RESULT_PATH
        if os.path.isfile(lastResultPath):
            lastResult = pd.read_csv(lastResultPath)
        else:
            lastResult = pd.DataFrame()
        header.lastResult = lastResult  # [2019.02.11] expose the previous result globally

        # Crawl window: from `checkRange` days ago up to today
        endDate = datetime.date.today()
        strDate = (endDate - datetime.timedelta(days=checkRange)).isoformat()

        soup = request2soup(url)

        # Extract the announcement titles from the page
        result = dataProcess_Title(soup, strDate)

        d = {
            'WEB_ADDR': url,
            'CRL_DATE': result['crawl_date'],
            'ISS_DATE': '',
            'TITL': result['titles_result'],
            'LNK_URL': result['links']
        }
        df = pd.concat([df, pd.DataFrame(data=d)], ignore_index=True)

        # Drop rows whose title already appeared in the previous crawl
        if not lastResult.empty:
            df = df[~df['TITL'].isin(lastResult['TITL'])]

        if len(df) == 0:
            logging.critical("No new data between %s and %s" % (strDate, endDate))
        else:
            df = df.reset_index(drop=True)
            lastResult = pd.concat([lastResult, df], ignore_index=True)
            # Keep only entries crawled within the check range
            lastResult = lastResult[pd.to_datetime(lastResult['CRL_DATE']) >= (
                datetime.date.today() - datetime.timedelta(days=checkRange))]
            header.outputCsv(lastResult, "lastResult", header.CRAWL_LIST_PATH)

    except Exception:
        header.EXIT_CODE = -1
        logging.error("Failed to parse the title list")
        traceback.print_exc()
    return df
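
`dataProcess_Title` is only visible here through the keys read from its return value: a crawl date plus parallel lists of titles and links. A hypothetical stub with that shape, where the CSS selector is invented for illustration:

import datetime

def dataProcess_Title(soup, strDate):
    # hypothetical extraction; the real project parses the site's announcement table
    anchors = soup.select("a.title")
    return {
        "crawl_date": datetime.date.today().isoformat(),
        "titles_result": [a.get_text(strip=True) for a in anchors],
        "links": [a.get("href") for a in anchors],
    }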
Example #4
import logging
import traceback

# `header`, `parsingTitle`, `parsingDetail`, and `FinalPath` are supplied
# by the surrounding project.
def main(url, checkRange=30):
    header.processBegin()
    header.clearFolder()
    try:
        df_1 = parsingTitle(url, checkRange)
        if len(df_1) == 0:
            header.processEnd()  # close the job out even when nothing is new
            return
        header.outputCsv(df_1, "第一層結果", FinalPath)
        df_2 = parsingDetail(df_1, FinalPath)
        header.outputCsv(df_2, "第二層結果", FinalPath)
        header.RESULT_COUNT = len(df_1)
        header.zipFile()
        header.createInfoFile()
        header.createOKFile()
        # [2019-02-01] added: write out the lastResult file
        header.outputLastResult(df_1, header.lastResult, checkRange)
    except Exception:
        logging.error("Crawl job failed")
        header.EXIT_CODE = -1
        traceback.print_exc()

    header.processEnd()
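
A typical entry point for any of these `main` variants would look like the following; the URL is a placeholder, not a target from the original projects:

if __name__ == "__main__":
    main("https://example.com/announcements", checkRange=30)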