Example 1
def dataProcess_Detail(soup):
    result = dict()
    # Subject (the second <h3> on the page)
    result['title'] = soup.select("h3")[1].text
    # Body text
    result['content'] = header.spaceAndWrapProcess(
        soup.select('.page_content')[0].text
    )
    # Attachments: link titles with the "(opens in new window)" suffix removed
    result['FILES'] = [
        e.get("title").replace("(開啟新視窗)", "") for e in soup.select('.acces a')
    ]
    # Cap each base name at 30 characters, keeping the extension
    FILES_NM = [
        os.path.splitext(ele)[0][:30] + os.path.splitext(ele)[1]
        for ele in result['FILES']
    ]
    result['FILES_NM'] = header.processDuplicateFiles(FILES_NM)

    logging.info(result['FILES_NM'])
    result['fileUrls'] = [
        "https://www.ib.gov.tw" + e.get('href')
        for e in soup.select('.acces a')
    ]
    str_content = str(soup.select('.page_content')[0])
    # Document number (發文字號) and issue date (發文日期); getResult's second
    # argument strips the 5-character label prefix from the match.
    result['serno'] = getResult(re.findall(r'發文字號.+?\d+.+?', str_content), 5)
    result['issue_date'] = getResult(re.findall(r'發文日期.+?日', str_content), 5)

    return result
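
All four snippets on this page assume module-level imports (os, re, logging) plus a project-local header module and, in this first example, a getResult helper, none of which are shown. The following is a minimal sketch of what those helpers might look like, inferred purely from how they are called above; the bodies are assumptions, not the project's actual code.

import os
import re

def getResult(matches, start):
    # Hypothetical: return the first match with its label prefix stripped,
    # or an empty string when the regex found nothing.
    return matches[0][start:] if matches else ''

class header:
    # Hypothetical stand-in for the project's header module.

    @staticmethod
    def spaceAndWrapProcess(text):
        # Assumed behavior: collapse whitespace and newlines into single spaces.
        return re.sub(r'\s+', ' ', text).strip()

    @staticmethod
    def processDuplicateFiles(names):
        # Assumed behavior: suffix repeated file names so they stay unique.
        seen, out = {}, []
        for name in names:
            n = seen.get(name, 0)
            seen[name] = n + 1
            base, ext = os.path.splitext(name)
            out.append(name if n == 0 else '{}({}){}'.format(base, n, ext))
        return out
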
Example 2
def dataProcess_Detail(soup, title_type):
    result = dict()
    fileUrlRoot = 'http://law.tii.org.tw/Fn/'
    str_content = soup.select('pre')[0].text
    if title_type == '行政函釋':  # administrative interpretation letters
        # Prefer the "主 旨" (subject) paragraph; fall back to the first
        # "一、" (item one) paragraph when it is missing.
        contentlist = re.findall('主 旨:.+',
                                 header.spaceAndWrapProcess(str_content))
        if contentlist:
            content = contentlist[0]
        else:
            content = re.findall('一、.+',
                                 header.spaceAndWrapProcess(str_content))[0]
        # [5:-1] drops the 5-character label prefix and the last matched character
        serno = re.findall(r'發文字號.+?\d+.+', str_content)[0][5:-1]
        issue_date = re.findall(r'發文日期.+?\d+.+', str_content)[0][5:-1]
    else:
        content = header.spaceAndWrapProcess(str_content)
        content1 = re.sub(r'\s', '', str_content)
        # NOTE: the original pattern used the character class [令|函|(公告)],
        # which matches single characters only; the alternation below is the
        # presumably intended reading (令 / 函 / 公告 right after 號).
        serno = re.findall(r'日\S+?號(?:令|函|公告)', content1)[0][1:]
        issue_date = re.findall(r'民國.+?日', str_content)[0]

    # Attachment links are nested inside <font> tags on this site
    result['FILES'] = [e.text for e in soup.select('font font a')]
    # Cap each base name at 30 characters, keeping the extension
    FILES_NM = [
        os.path.splitext(ele)[0][:30] + os.path.splitext(ele)[1]
        for ele in result['FILES']
    ]
    result['FILES_NM'] = header.processDuplicateFiles(FILES_NM)
    logging.info(result['FILES_NM'])
    # Rewrite the relative './' prefix into an absolute URL
    result['fileUrls'] = [
        re.sub(r'\.\/', fileUrlRoot, e.get('href'))
        for e in soup.select('font font a')
    ]
    result['content'] = content
    result['serno'] = serno
    result['issue_date'] = issue_date
    return result
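
A minimal driver for this variant might look like the following; the detail-page URL is a placeholder (the real ones come from the site's list pages), and the snippet assumes requests and BeautifulSoup are installed.

import requests
from bs4 import BeautifulSoup

# Hypothetical detail-page URL; substitute one scraped from the list page.
resp = requests.get('http://law.tii.org.tw/Fn/some_detail_page')
resp.encoding = 'utf-8'  # the site serves Chinese text
soup = BeautifulSoup(resp.text, 'html.parser')
detail = dataProcess_Detail(soup, '行政函釋')
print(detail['serno'], detail['issue_date'])
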
Example 3
def dataProcess_Detail(soup, fileUrlRoot):
    result = dict()
    result['content'] = header.spaceAndWrapProcess(soup.select('.page_content')[0].text)
    result['FILES'] = [e.text for e in soup.select('.acces a')]
    # Cap each base name at 30 characters, keeping the extension
    FILES_NM = [os.path.splitext(ele)[0][:30] + os.path.splitext(ele)[1] for ele in result['FILES']]
    result['FILES_NM'] = header.processDuplicateFiles(FILES_NM)
    
    #logging.info(result['FILES_NM'])
    result['fileUrls'] = [fileUrlRoot + e.get('href') for e in soup.select('.acces a')]
    str_content = str(soup.select('.page_content')[0])
    # [5:] strips the 發文字號 / 發文日期 label prefix from the match
    result['serno'] = re.findall(r'發文字號.+?\d+.+?', str_content)[0][5:]
    result['issue_date'] = re.findall(r'發文日期.+?日', str_content)[0][5:]
    return result
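
The os.path.splitext(...)[0][:30] + os.path.splitext(...)[1] expression that recurs in every example caps the base file name at 30 characters while preserving the extension. If the copies were consolidated, a small helper along these lines would do (a sketch, not code from the project):

import os

def truncate_filename(name, limit=30):
    # Keep at most `limit` characters of the base name; preserve the extension.
    base, ext = os.path.splitext(name)
    return base[:limit] + ext

# truncate_filename('a' * 40 + '.pdf') returns thirty 'a's followed by '.pdf'
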
Example 4
def dataProcess_Detail(soup, row):
    urlGazette = 'https://gazette.nat.gov.tw'  # Executive Yuan Gazette Online (行政院公報資訊網)
    urlGazette1 = 'http://gazette.nat.gov.tw'  # same site over plain HTTP
    urlmoj = 'https://law.moj.gov.tw/News/'  # Laws & Regulations Database (全國法規資料庫)
    urlfsc = 'http://law.fsc.gov.tw/law/'  # FSC law inquiry system (主管法規查詢系統)
    title = row['TITL']
    link = row['LNK_URL']
    result = dict()
    # The gazette page embeds the announcement as a PDF viewer
    if urlGazette in link or urlGazette1 in link:
        fileUrls = [urlGazette + e.get('src') for e in soup.select('.embed-responsive-item')]
        fileNames = [title + '.pdf' for _ in range(len(fileUrls))]
        content = ''
        serno = soup.select('section.Block p span')[2].text
        issue_date = soup.select('div.Item section.Block p span')[1].text
    elif urlmoj in link:
        content = header.spaceAndWrapProcess(soup.select('.text-pre')[0].text)
        fileNames = [e.text for e in soup.select('#litFile a')]
        fileUrls = [urlmoj + '/' + e.get('href') for e in soup.select('#Content a')]
        serno = ''
        issue_date = soup.select('td')[0].text
    elif urlfsc in link:
        if 'DraftOpinion.aspx' in link:  # draft / pre-announcement pages
            content = '預告日期:' + soup.select('td')[1].text.strip()
            serno = soup.select('td')[2].text.strip()
            issue_date = soup.select('td')[0].text.strip()[:9]
        elif 'NewsContent.aspx' in link:  # regular announcement pages
            content1 = [header.spaceAndWrapProcess(e.text) for e in soup.select('.ClearCss')]
            content2 = [header.spaceAndWrapProcess(e.text) for e in soup.select('#ctl00_cp_content_trPreamble td')]
            content = content1[0] if content1 else content2[0]
            serno = soup.select('#ctl00_cp_content_trODWord td')[0].text.strip()
            issue_date = soup.select('#ctl00_cp_content_trAnnDate td')[0].text.strip()
        fileUrls = [urlfsc + e.get('href') for e in soup.select('#ctl00_cp_content_ulAnnFiles02 a')]
        fileNames = [e.text for e in soup.select('#ctl00_cp_content_ulAnnFiles02 a')]
    
    # NOTE: a link matching none of the URL prefixes above would leave these unbound
    result['fileUrls'] = fileUrls
    result['FILES'] = fileNames
    # Cap each base name at 30 characters, keeping the extension
    FILES_NM = [os.path.splitext(ele)[0][:30] + os.path.splitext(ele)[1] for ele in result['FILES']]
    result['FILES_NM'] = header.processDuplicateFiles(FILES_NM)
    result['content'] = content
    result['serno'] = serno
    result['issue_date'] = issue_date
    return result
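
The serno / issue_date extraction in Examples 1 to 3 is a plain regex scan over the rendered HTML, followed by slicing off the 5-character label. A self-contained illustration on made-up input (the sample string below is invented for the demo, not taken from a real page):

import re

# Invented sample text mimicking the layout of the scraped pages.
str_content = '發文日期:民國108年5月1日 發文字號:金管銀法字第10801234560號'
serno = re.findall(r'發文字號.+?\d+.+?', str_content)[0][5:]
issue_date = re.findall(r'發文日期.+?日', str_content)[0][5:]
print(serno)       # 金管銀法字第10801234560號
print(issue_date)  # 民國108年5月1日
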