import datetime
import logging
import os
import traceback

import pandas as pd

import header  # shared helper module: run bookkeeping, CSV/zip output, paths

# WEB_URL, FINAL_PATH, FinalPath: constants assumed to be defined elsewhere in
# the crawler module (target site URL and output directories).


def main(checkRange=30):
    header.processBegin(url=WEB_URL)
    header.clearFolder()
    try:
        # Crawl window: the last checkRange days, ending today
        endDate = datetime.date.today()
        strDate = (endDate - datetime.timedelta(days=checkRange)).isoformat()
        df_1 = parsingTitle(strDate, endDate)

        # Check whether there is any new data to crawl
        RESULT_COUNT = len(df_1)
        if RESULT_COUNT < 1:
            logging.critical("No data updated between %s and %s" % (strDate, endDate))
        else:
            header.outputCsv(df_1, "第一層結果", FINAL_PATH)  # first-layer (title list) results

            df_2 = parsingDetail(df_1)
            header.outputCsv(df_2, "第二層結果", FINAL_PATH)  # second-layer (detail) results
            header.RESULT_COUNT = RESULT_COUNT

            # Update the crawlHistory file
            header.outputLastResult(df_1, header.lastResult, checkRange)

        header.zipFile()
        header.createInfoFile()
        header.createOKFile()
    except Exception:
        header.EXIT_CODE = -1
        logging.error("Crawl job failed")
        logging.error(str(traceback.format_exc()))

    header.processEnd()
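# --------------------------------------------------------------------------
# Hypothetical sketch of the shared "header" module's interface, inferred
# from the calls above. The real implementation lives elsewhere in the repo;
# signatures and behavior may differ. Shown as a stand-in class so it can sit
# in this file without shadowing the real import.
class _HeaderStub:
    EXIT_CODE = 0        # set to -1 by crawlers on failure
    RESULT_COUNT = 0     # number of rows produced by the run
    lastResult = None    # previous crawl result, cached by parsingTitle

    def processBegin(self, url=None): ...  # start-of-run bookkeeping/logging
    def clearFolder(self): ...             # empty the working/output folder
    def outputCsv(self, df, name, path=None): ...  # write df as <name>.csv under path
    def outputLastResult(self, df, lastResult, checkRange): ...  # refresh crawl history
    def zipFile(self): ...                 # zip the run's output files
    def createInfoFile(self): ...          # write run metadata (count, exit code)
    def createOKFile(self): ...            # drop an OK marker for downstream jobs
    def processEnd(self): ...              # end-of-run bookkeeping/logging
# --------------------------------------------------------------------------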
# Variant of main() that takes the target url explicitly and fetches the page
# itself before delegating to parsingTitle().
def main(url, checkRange=30):
    header.processBegin(url=url)
    header.clearFolder()
    try:
        soup = request2soup(url, 1)

        df_1 = parsingTitle(soup, checkRange)
        if len(df_1) != 0:
            header.outputCsv(df_1, "第一層結果")  # first-layer (title list) results

            df_2 = parsingDetail(df_1)
            header.outputCsv(df_2, "第二層結果")  # second-layer (detail) results
            header.RESULT_COUNT = len(df_2)

        header.zipFile()
        header.createInfoFile()
        header.createOKFile()
        header.outputLastResult(df_1, header.lastResult, checkRange)  # [2019.02.11] write lastResult
    except Exception:
        logging.error("Crawl job failed")
        traceback.print_exc()
        header.createInfoFile()

    header.processEnd()
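# request2soup() is called above but defined elsewhere in the repo. A minimal
# sketch, assuming requests + BeautifulSoup and that pagination (when used) is
# a "page" query parameter; the real helper may add headers, retries, etc.
import requests
from bs4 import BeautifulSoup

def request2soup(url, page=None):
    params = {"page": page} if page is not None else None
    resp = requests.get(url, params=params, timeout=30)
    resp.raise_for_status()
    resp.encoding = resp.apparent_encoding  # guard against mis-detected encodings
    return BeautifulSoup(resp.text, "html.parser")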
def parsingTitle(soup, checkRange):
    df = pd.DataFrame(columns=["WEB_ADDR", "CRL_DATE", "ISS_DATE", "TITL", "LNK_URL"])
    try:
        # Load the previous crawl result, if any
        lastResultPath = header.LAST_RESULT_PATH  # +"/lastResult.csv"  [2019.02.11]
        if os.path.isfile(lastResultPath):
            lastResult = pd.read_csv(lastResultPath)
        else:
            lastResult = pd.DataFrame()
        header.lastResult = lastResult  # [2019.02.11] expose as a global on header

        # Crawl window: the last checkRange days, ending today
        endDate = datetime.date.today()
        strDate = (endDate - datetime.timedelta(days=checkRange)).isoformat()

        # Parse the title list out of the page
        result = dataProcess_Title(soup, strDate)
        d = {
            "WEB_ADDR": url,  # NOTE: url is assumed to be a module-level constant
            "CRL_DATE": result["crawl_date"],
            "ISS_DATE": "",
            "TITL": result["titles_result"],
            "LNK_URL": result["links"],
        }
        df = pd.concat([df, pd.DataFrame(data=d)])

        # Skip rows whose title already appeared in the previous crawl
        if not lastResult.empty:
            for index, row in df.iterrows():
                if row["TITL"] in list(lastResult["TITL"]):
                    df.drop(index, inplace=True)

        if len(df) == 0:
            logging.critical("No data updated between %s and %s" % (strDate, endDate))
        else:
            df.reset_index(drop=True, inplace=True)
            lastResult = pd.concat([lastResult, df], ignore_index=True)
            # Keep only history rows crawled within the check window
            lastResult = lastResult[pd.to_datetime(lastResult["CRL_DATE"]) >=
                                    (datetime.date.today() - datetime.timedelta(days=checkRange))]
            header.outputCsv(lastResult, "lastResult", header.CRAWL_LIST_PATH)
    except Exception:
        header.EXIT_CODE = -1
        logging.error("Failed to parse the title list")
        traceback.print_exc()
    return df
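# dataProcess_Title() does the actual scraping and is not shown in this file.
# A hypothetical stand-in: it assumes announcements are plain <a> tags and
# returns the dict shape parsingTitle expects. The real version would also use
# strDate to drop posts older than the window, but that markup is site-specific.
def dataProcess_Title(soup, strDate):
    titles, links = [], []
    for a in soup.find_all("a", href=True):
        title = a.get_text(strip=True)
        if title:  # skip empty anchors
            titles.append(title)
            links.append(a["href"])
    return {
        "crawl_date": datetime.date.today().isoformat(),  # scalar; broadcast by DataFrame
        "titles_result": titles,
        "links": links,
    }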
# Another variant of main(): parsingTitle here takes the url itself, and the
# output path constant is named FinalPath.
def main(url, checkRange=30):
    header.processBegin()
    header.clearFolder()
    try:
        df_1 = parsingTitle(url, checkRange)
        if len(df_1) == 0:
            return  # nothing new: skip output, zipping, and the OK marker
        header.outputCsv(df_1, "第一層結果", FinalPath)  # first-layer (title list) results

        df_2 = parsingDetail(df_1, FinalPath)
        header.outputCsv(df_2, "第二層結果", FinalPath)  # second-layer (detail) results
        header.RESULT_COUNT = len(df_1)

        header.zipFile()
        header.createInfoFile()
        header.createOKFile()
        header.outputLastResult(df_1, header.lastResult, checkRange)  # 2019-02-01: write lastResult
    except Exception:
        logging.error("Crawl job failed")
        header.EXIT_CODE = -1
        traceback.print_exc()

    header.processEnd()
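# Minimal entry point for the url-taking variants above; WEB_URL is a
# hypothetical placeholder for the site constant defined in the real module.
if __name__ == "__main__":
    main(WEB_URL, checkRange=30)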