def dataProcess_Detail(soup):
    """Parse a detail page into a result dict.

    Extracts the title (second <h3>), the normalized body text, attachment
    names/URLs from the '.acces a' anchors, and the document number /
    issue date scraped out of the raw '.page_content' HTML.
    """
    result = dict()

    # Title: the page's second <h3> element.
    result['title'] = soup.select("h3")[1].text

    # Body: normalize whitespace/line-wrapping via the shared helper.
    result['content'] = header.spaceAndWrapProcess(soup.select('.page_content')[0].text)

    # Attachments: hoist the anchor query so it runs once for names and URLs.
    attachments = soup.select('.acces a')
    result['FILES'] = [a.get("title").replace("(開啟新視窗)", "") for a in attachments]

    # Truncate each stem to 30 chars, keep the extension, then dedupe.
    trimmed = []
    for name in result['FILES']:
        stem, ext = os.path.splitext(name)
        trimmed.append(stem[:30] + ext)
    result['FILES_NM'] = header.processDuplicateFiles(trimmed)
    logging.info(result['FILES_NM'])

    result['fileUrls'] = ["https://www.ib.gov.tw" + a.get('href') for a in attachments]

    # Document number and issue date live in the raw HTML of the content div;
    # getResult guards against a missing match.
    str_content = str(soup.select('.page_content')[0])
    result['serno'] = getResult(re.findall(r'發文字號.+?\d+.+?', str_content), 5)
    result['issue_date'] = getResult(re.findall(r'發文日期.+?日', str_content), 5)
    return result
def dataProcess_Detail(soup, title_type):
    """Parse a law.tii.org.tw detail page (text lives in a <pre> element).

    For '行政函釋' pages the content/serno/issue_date are pulled from the
    labelled 主旨/發文字號/發文日期 lines; for other page types they are
    scraped positionally from the announcement text.
    """
    result = dict()
    fileUrlRoot = 'http://law.tii.org.tw/Fn/'
    str_content = [e.text for e in soup.select('pre')][0]

    if title_type == '行政函釋':
        # Prefer the 主旨 (subject) line; fall back to the first 一、 clause.
        contentlist = re.findall('主 旨:.+', header.spaceAndWrapProcess(str_content))
        if bool(contentlist):
            content = contentlist[0]
        else:
            content = re.findall('一、.+', header.spaceAndWrapProcess(str_content))[0]
        # [5:-1] strips the 5-char '發文字號:' / '發文日期:' label and a
        # trailing character.
        serno = re.findall(r'發文字號.+?\d+.+', str_content)[0][5:-1]
        issue_date = re.findall(r'發文日期.+?\d+.+', str_content)[0][5:-1]
    else:
        content = header.spaceAndWrapProcess(str_content)
        content1 = re.sub(r'\s', '', str_content)
        # BUG FIX: the original pattern ended in the character class
        # [令|函|(公告)], which matches exactly ONE of the characters
        # 令 | 函 ( 公 告 ) — so '...號公告' was truncated to '...號公' and
        # literal '|'/'(' characters matched spuriously. Use a proper
        # non-capturing alternation instead.
        serno = re.findall(r'日\S+?號(?:令|函|公告)', content1)[0][1:]
        issue_date = re.findall('民國.+?日', str_content)[0]

    # Attachments: names and relative hrefs from nested <font> anchors.
    result['FILES'] = [e.text for e in soup.select('font font a')]
    FILES_NM = [
        os.path.splitext(ele)[0][:30] + os.path.splitext(ele)[1]
        for ele in result['FILES']
    ]
    result['FILES_NM'] = header.processDuplicateFiles(FILES_NM)
    logging.info(result['FILES_NM'])
    # Relative './' links are rebased onto the site's file root.
    result['fileUrls'] = [
        re.sub(r'\.\/', fileUrlRoot, e.get('href'))
        for e in soup.select('font font a')
    ]
    result['content'] = content
    result['serno'] = serno
    result['issue_date'] = issue_date
    return result
def dataProcess_Detail(soup, fileUrlRoot):
    """Parse a '.page_content' detail page into a result dict.

    Extracts the normalized body text, attachment names/URLs from the
    '.acces a' anchors (hrefs rebased onto ``fileUrlRoot``), and the
    document number / issue date scraped from the raw content HTML.
    """
    result = dict()
    result['content'] = [
        header.spaceAndWrapProcess(e.text) for e in soup.select('.page_content')
    ][0]
    result['FILES'] = [e.text for e in soup.select('.acces a')]
    FILES_NM = [
        os.path.splitext(ele)[0][:30] + os.path.splitext(ele)[1]
        for ele in result['FILES']
    ]
    result['FILES_NM'] = header.processDuplicateFiles(FILES_NM)
    logging.info(result['FILES_NM'])
    result['fileUrls'] = [fileUrlRoot + e.get('href') for e in soup.select('.acces a')]

    str_content = str(soup.select('.page_content')[0])
    # FIX: the original indexed findall(...)[0] unconditionally, raising
    # IndexError on pages missing the 發文字號/發文日期 fields. Guard the
    # empty case and fall back to '' (matching the sibling parsers' safe
    # getResult behavior). [5:] strips the 5-char '發文字號:'/'發文日期:' label.
    serno_matches = re.findall(r'發文字號.+?\d+.+?', str_content)
    result['serno'] = serno_matches[0][5:] if serno_matches else ''
    date_matches = re.findall(r'發文日期.+?日', str_content)
    result['issue_date'] = date_matches[0][5:] if date_matches else ''
    return result
def dataProcess_Detail(soup, row):
    """Parse a detail page whose layout depends on the host in row['LNK_URL'].

    Supported hosts: the Executive Yuan gazette (gazette.nat.gov.tw), the
    national law database (law.moj.gov.tw) and the FSC law system
    (law.fsc.gov.tw). Returns a dict with content, serno, issue_date and
    attachment names/URLs.
    """
    urlGazette = 'https://gazette.nat.gov.tw'  # linkto 行政院公報資訊網
    urlGazette1 = 'http://gazette.nat.gov.tw'  # linkto 行政院公報資訊網
    urlmoj = 'https://law.moj.gov.tw/News/'    # linkto 全國法規資料庫
    urlfsc = 'http://law.fsc.gov.tw/law/'      # linkto 主管法規查詢系統
    title = row['TITL']
    link = row['LNK_URL']
    result = dict()

    # FIX: initialize every output up front — the original left these
    # unbound when `link` matched none of the known hosts (or an FSC link
    # matched neither DraftOpinion nor NewsContent), raising NameError at
    # the result[...] assignments below.
    content = ''
    serno = ''
    issue_date = ''
    fileUrls = []
    fileNames = []

    if link.find(urlGazette) >= 0 or link.find(urlGazette1) >= 0:
        # Gazette pages embed the PDF; name each file after the row title.
        fileUrls = [urlGazette + e.get('src') for e in soup.select('.embed-responsive-item')]
        fileNames = [title + '.pdf' for i in range(len(fileUrls))]
        content = ''
        serno = [e.text for e in soup.select('section.Block p span')][2]
        issue_date = soup.select('div.Item section.Block p span')[1].text
    elif link.find(urlmoj) >= 0:
        content = [header.spaceAndWrapProcess(e.text) for e in soup.select('.text-pre')][0]
        # NOTE(review): names come from '#litFile a' but URLs from
        # '#Content a' — looks inconsistent; confirm both selectors hit the
        # same anchor list on real pages.
        fileNames = [e.text for e in soup.select('#litFile a')]
        fileUrls = [urlmoj + '/' + e.get('href') for e in soup.select('#Content a')]
        serno = ''
        issue_date = [e.text for e in soup.select('td')][0]
    elif link.find(urlfsc) >= 0:
        if link.find('DraftOpinion.aspx') >= 0:
            content = '預告日期:' + [e.text.strip() for e in soup.select('td')][1]
            serno = [e.text.strip() for e in soup.select('td')][2]
            issue_date = [e.text.strip() for e in soup.select('td')][0][:9]
        elif link.find('NewsContent.aspx') >= 0:
            # Body may live in either of two containers depending on page vintage.
            content1 = [header.spaceAndWrapProcess(e.text) for e in soup.select('.ClearCss')]
            content2 = [header.spaceAndWrapProcess(e.text) for e in soup.select('#ctl00_cp_content_trPreamble td')]
            content = content1[0] if content1 else content2[0]
            serno = [e.text.strip() for e in soup.select('#ctl00_cp_content_trODWord td')][0]
            issue_date = [e.text.strip() for e in soup.select('#ctl00_cp_content_trAnnDate td')][0]
        fileUrls = [urlfsc + e.get('href') for e in soup.select('#ctl00_cp_content_ulAnnFiles02 a')]
        fileNames = [e.text for e in soup.select('#ctl00_cp_content_ulAnnFiles02 a')]

    result['fileUrls'] = fileUrls
    result['FILES'] = fileNames
    # Truncate each stem to 30 chars, keep the extension, then dedupe.
    FILES_NM = [
        os.path.splitext(ele)[0][:30] + os.path.splitext(ele)[1]
        for ele in result['FILES']
    ]
    result['FILES_NM'] = header.processDuplicateFiles(FILES_NM)
    result['content'] = content
    result['serno'] = serno
    result['issue_date'] = issue_date
    return result