Example #1
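
All of these snippets are excerpts from full spider modules; they assume roughly the imports below. The common module is the project's own helper library, so its exact contents are an assumption here.

import logging
import os
import re
import subprocess
import time
from urllib.parse import urljoin  # Python 2: from urlparse import urljoin

import scrapy

import common  # project helpers: election_year, ROC2AD, meeting_minutes_output_path
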
class Spider(scrapy.Spider):
    name = "meeting"
    allowed_domains = ["tncc.gov.tw"]
    start_urls = ["http://www.tncc.gov.tw",]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    output_path = common.meeting_minutes_output_path(county_abbr, election_year)
    ads = {'2010': 1, '2014': 2, '2018': 3}
    ad = ads[election_year]

    def parse(self, response):
        return response.follow(
            response.xpath(u'//a[re:test(., "^出版品$")]/@href').extract_first(),
            callback=self.parse_list)

    def parse_list(self, response):
        for tr in response.css('#table2 tr'):
            link = tr.xpath(u'descendant::a[re:test(., "^第%d屆")]/@href' % self.ad).extract_first()
            if link:
                item = {}
                item['election_year'] = self.election_year
                item['date'] = common.ROC2AD(tr.xpath('td[1]/text()').extract_first())
                item['meeting'] = tr.xpath('td[3]/descendant::a/text()').extract_first()
                item['meeting'] = item['meeting'].replace('.', u'、')
                item['download_url'] = urljoin(response.url, link)
                ext = item['download_url'].split('.')[-1]
                file_name = '%s.%s' % (item['meeting'], ext)
                cmd = 'mkdir -p %s && wget --no-check-certificate -c -O "%s%s" "%s"' % (
                    self.output_path, self.output_path, file_name,
                    item['download_url'])
                retcode = subprocess.call(cmd, shell=True)
                yield item
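
The date columns on these council sites use the Minguo (ROC) calendar, which common.ROC2AD converts to AD. That helper lives in the project's common module; a minimal stand-in sketch, assuming dates like '103.5.6' and the standard 1911-year offset:

def roc2ad(date_str):
    # Hypothetical stand-in for common.ROC2AD: '103.5.6' -> '2014-05-06'.
    match = re.match(r'(\d+)\D+(\d+)\D+(\d+)', date_str or '')
    if not match:
        return None
    year, month, day = (int(g) for g in match.groups())
    return '%04d-%02d-%02d' % (year + 1911, month, day)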

Example #2

class Spider(scrapy.Spider):
    name = "meeting"
    allowed_domains = ["www.tycc.gov.tw"]
    start_urls = [
        "http://www.tycc.gov.tw/content/public/public_main.aspx?wtp=1&wnd=217",
    ]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    output_path = common.meeting_minutes_output_path(county_abbr,
                                                     election_year)
    election_years_ad = {'2014': '1', '2010': '17'}
    ad = election_years_ad[election_year]

    def parse(self, response):
        nodes = response.xpath(
            u'//tr/td[re:test(@title, "第%s屆")]/following-sibling::td/a[re:test(., "會$")]'
            % self.ad)
        for node in nodes:
            item = {}
            item['election_year'] = self.election_year
            item['download_url'] = urljoin(
                response.url,
                node.xpath('@href').extract_first().strip())
            item['sitting'] = u'第%s屆' % self.ad
            item['meeting'] = node.xpath('descendant::*/text()').re(
                u'屆(.+會)$')[0]
            item['meeting'] = item['meeting'].replace('.', u'、')
            ext = node.xpath('@href').extract_first().split('.')[-1]
            file_name = '%s_%s.%s' % (item['sitting'], item['meeting'], ext)
            cmd = 'mkdir -p %s && wget -c -O "%s%s" "%s"' % (
                self.output_path, self.output_path, file_name,
                item['download_url'])
            retcode = subprocess.call(cmd, shell=True)
            yield item
        nodes = response.xpath(
            u'//tr/td[re:test(@title, "第%s屆")]/following-sibling::td/a[re:test(., "(冊|pdf)$")]'
            % self.ad)
        for node in nodes:
            item = {}
            item['election_year'] = self.election_year
            item['download_url'] = urljoin(
                response.url,
                node.xpath('@href').extract_first().strip())
            item['sitting'] = u'第%s屆' % self.ad
            item['meeting'] = '%s%s' % (
                node.xpath('preceding::td[1]/text()').re(u'屆(.+會)')[0],
                node.xpath('descendant::*/text()').re(u'(.+冊)')[0])
            item['meeting'] = item['meeting'].replace('.', u'、')
            ext = node.xpath('@href').extract_first().split('.')[-1]
            file_name = '%s_%s.%s' % (item['sitting'], item['meeting'], ext)
            cmd = 'mkdir -p %s && wget -c -O "%s%s" "%s"' % (
                self.output_path, self.output_path, file_name,
                item['download_url'])
            retcode = subprocess.call(cmd, shell=True)
            yield item
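
The mkdir -p plus wget pattern shells out once per file, and scraped meeting names flow straight into the command line. A shell-free sketch using only the standard library (the download function name is an assumption, not part of the original spiders):

import os
from urllib.request import urlretrieve

def download(url, output_path, file_name):
    # Create the target directory, then fetch the file without a shell,
    # so spaces or quotes in file_name cannot break anything.
    os.makedirs(output_path, exist_ok=True)
    urlretrieve(url, os.path.join(output_path, file_name))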
Example #3
class Spider(scrapy.Spider):
    name = "meeting"
    allowed_domains = ["cissearch.kcc.gov.tw"]
    start_urls = [
        "http://cissearch.kcc.gov.tw/System/MeetingRecord/Default.aspx",
    ]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    output_path = common.meeting_minutes_output_path(county_abbr,
                                                     election_year)

    def parse(self, response):
        count = response.xpath(
            '//span[@id="ContentPlaceHolder1_DataPager1"]/text()').re(
                u'共\s*(\d+)\s*筆')[0]
        payload = {
            'ctl00$ContentPlaceHolder1$DataPager1$ctl02$txtPageSize': count
        }
        # Post back with the full row count as the page size so every
        # record lands on a single page.
        yield scrapy.FormRequest.from_response(response,
                                               formdata=payload,
                                               callback=self.parse_profile,
                                               dont_filter=True)

    def parse_profile(self, response):
        trs = response.xpath('//table[@id="ContentPlaceHolder1_gvIndex"]/tr')
        for tr in trs:
            item = {}
            tds = tr.xpath('td')
            if tds:
                item['election_year'] = self.election_year
                item['date'] = common.ROC2AD(
                    tds[1].xpath('text()').extract_first())
                item['meeting'] = tds[2].xpath('text()').re(
                    u'(.+?)[紀記]錄')[0]
                item['download_url'] = urljoin(
                    response.url,
                    tds[3].xpath('a/@href').extract_first().strip())
                ext = item['download_url'].split('.')[-1]
                file_name = '%s.%s' % (item['meeting'], ext)
                cmd = 'mkdir -p %s && wget -c -O "%s%s" "%s"' % (
                    self.output_path, self.output_path, file_name,
                    item['download_url'])
                retcode = subprocess.call(cmd, shell=True)
                time.sleep(1)
                yield item
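
Scraped titles become file names and shell arguments throughout these spiders, so a sanitizing step is cheap insurance. A sketch; the sanitize helper is an addition, not part of the original code:

import re

def sanitize(name, max_len=120):
    # Replace path separators, quotes and other shell metacharacters,
    # then cap the length so the file name stays portable.
    return re.sub(r'[\\/:*?"<>|$`&;\s]+', '_', name)[:max_len]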

Example #4

class Spider(scrapy.Spider):
    name = "meeting"
    allowed_domains = ["www.kmc.gov.tw", "ebook.21cms.tw"]
    start_urls = ["http://www.kmc.gov.tw/recorder",]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    output_path = common.meeting_minutes_output_path(county_abbr, election_year)
    ads = {
        '2014': u'第十八屆',
        '2009': u'第十七屆'
    }
    ad = ads[election_year]

    def parse(self, response):
        nodes = response.css('.panel-body').xpath(
            u'descendant::a[re:test(., "%s")]' % self.ad)
        for node in nodes:
            link = node.xpath('@href').extract_first()
            item = {}
            item['election_year'] = self.election_year
            item['sitting'] = node.xpath('text()').extract_first().replace(u'(點擊閱讀)', '').replace('>>', '')
            item['download_url'] = urljoin(response.url, link)
            if re.search('/ebook/', link):
                file_name = '%s.pdf' % (item['sitting'], )
                cmd = 'mkdir -p %s && wget -A pdf -nd -r --no-parent -O "%s%s" "%s"' % (
                    self.output_path, self.output_path, file_name,
                    urljoin(response.url, link))
                retcode = subprocess.call(cmd, shell=True)
                yield item
            else:
                yield response.follow(link, callback=self.parse_iframe, meta={'item': item})

    def parse_iframe(self, response):
        link = response.css('.article-content iframe').xpath('@src').extract_first()
        item = response.meta['item']
        file_name = '%s.pdf' % (item['sitting'], )
        cmd = 'mkdir -p %s && wget -A pdf -nd -r --no-parent -O "%s%s" "%s"' % (
            self.output_path, self.output_path, file_name, link)
        retcode = subprocess.call(cmd, shell=True)
        yield item
Example #5
class Spider(scrapy.Spider):
    name = "meeting"
    allowed_domains = ["www.ilcc.gov.tw"]
    start_urls = [
        "http://www.ilcc.gov.tw/Html/H_06/H_06.asp",
    ]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    output_path = common.meeting_minutes_output_path(county_abbr,
                                                     election_year)
    payload = {
        'ddlcounciltype': u'大會',
    }

    def parse(self, response):
        return response.follow(
            response.xpath(u'//area[@alt="會議紀錄"]/@href').extract_first(),
            callback=self.parse_frame)

    def parse_frame(self, response):
        return response.follow(
            response.xpath('//frame[@id="FrMain"]/@src').extract_first(),
            callback=self.parse_meeting_info)

    def parse_meeting_info(self, response):
        # Filter the meeting list to plenary sessions (大會).
        return scrapy.FormRequest.from_response(response,
                                                formdata=self.payload,
                                                callback=self.parse_pages)

    def parse_pages(self, response):
        pages = response.xpath(
            '//select[@name="page"]/option/@value').extract()
        for page in pages:
            yield scrapy.FormRequest.from_response(response,
                                                   formdata={
                                                       'page': page,
                                                       'btSearch': None
                                                   },
                                                   callback=self.parse_post)

    def parse_post(self, response):
        trs = response.xpath('//table[@id="dg"]/descendant::tr[position()>1]')
        for tr in trs:
            item = {}
            item['election_year'] = self.election_year
            item['date'] = re.sub(r'\s', '',
                                  tr.xpath('string(td[1])').extract_first())
            item['sitting'] = re.sub(
                r'\s', '', '%s%s' % (tr.xpath('string(td[2])').extract_first(),
                                     tr.xpath('string(td[3])').extract_first()))
            item['meeting'] = re.sub(r'\s', '',
                                     tr.xpath('string(td[5])').extract_first())
            yield response.follow(
                tr.xpath('td[4]/descendant::a/@href').extract_first(),
                callback=self.parse_profile,
                meta={'item': item})

    def parse_profile(self, response):
        item = response.meta['item']
        item['download_url'] = response.xpath(
            '//td/a[@target="_blank"]/@href').extract_first()
        if item['download_url']:
            ext = re.search(r'\.(\w+)$', item['download_url']).group(1)
            file_name = '%s_%s.%s' % (item['sitting'], item['meeting'], ext)
            cmd = 'mkdir -p %s && wget -c -O "%s%s" "%s"' % (
                self.output_path, self.output_path, file_name,
                item['download_url'])
            retcode = subprocess.call(cmd, shell=True)
        else:
            logging.error(response.url)
        return item

Example #6

class Spider(scrapy.Spider):
    name = "meeting"
    allowed_domains = ["www.ntp.gov.tw"]
    start_urls = [
        'https://www.ntp.gov.tw/content/information/information04.aspx'
    ]
    download_delay = 1
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    output_path = common.meeting_minutes_output_path(county_abbr,
                                                     election_year)

    def parse(self, response):
        for node in response.xpath(
                u'//a[contains(@title, "HTML檔")]/@href').extract():
            yield response.follow(node, callback=self.parse_sitting)

    def parse_sitting(self, response):
        for node in response.xpath(u'//td/descendant::a/@href').extract():
            yield response.follow(node, callback=self.parse_meeting)

    def parse_meeting(self, response):
        try:
            sitting = response.xpath('//text()').re(u'(.+)日程表')[0]
            trs = [
                tr for tr in response.xpath('//table/descendant::tr')
                if tr.xpath('td[3]/text()').re(r'\d+')
            ]
            for tr in trs:
                item = {}
                item['election_year'] = self.election_year
                item['date'] = common.ROC2AD(
                    tr.xpath('td[1]/text()').extract_first())
                item['sitting'] = sitting
                item['meeting'] = tr.xpath('td[3]/text()').extract_first()
                item['download_url'] = tr.xpath(
                    'td[6]/descendant::a[1]/@href').extract_first()
                ext = item['download_url'].split('.')[-1]
                file_name = '%s_%s.%s' % (item['sitting'], item['meeting'],
                                          ext)
                if ext == 'pdf':
                    yield response.follow(item['download_url'],
                                          callback=self.download_pdf,
                                          meta={
                                              'item': item,
                                              'file_name': file_name
                                          })
                elif ext == 'htm':
                    yield response.follow(item['download_url'],
                                          callback=self.parse_html,
                                          meta={
                                              'item': item,
                                              'file_name': file_name
                                          })
        except scrapy.exceptions.NotSupported:
            # .xpath() raises NotSupported on non-text responses
            # (e.g. when a link turns out to point at a binary file).
            pass

    def download_pdf(self, response):
        item = response.meta['item']
        item['download_url'] = response.url
        cmd = 'mkdir -p %s && wget --no-check-certificate -c -O "%s%s" "%s"' % (
            self.output_path, self.output_path, response.meta['file_name'],
            item['download_url'])
        retcode = subprocess.call(cmd, shell=True)
        return item

    def parse_html(self, response):
        item = response.meta['item']
        item['download_url'] = response.url
        text = '\n'.join(response.xpath('//pre/text()').extract())
        write_file(
            text, '%s%s_%s.txt' %
            (self.output_path, item['sitting'], item['meeting']))
        return item
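
parse_html above calls write_file, which is defined elsewhere in the project. A minimal sketch of what it presumably does, assuming UTF-8 output and that parent directories may not exist yet:

import codecs
import os

def write_file(text, path):
    # Presumed behaviour of the project's write_file helper: make sure
    # the directory exists, then write the text as UTF-8.
    directory = os.path.dirname(path)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)
    with codecs.open(path, 'w', encoding='utf-8') as f:
        f.write(text)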
Example #7
class Spider(scrapy.Spider):
    name = "meeting"
    allowed_domains = ["obas_front.tcc.gov.tw"]
    start_urls = [
        "http://obas_front.tcc.gov.tw:8080/Agenda/EFileSearch.aspx?FileGrpKind=2&h=600",
    ]
    download_delay = 0.5
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    output_path = common.meeting_minutes_output_path(county_abbr,
                                                     election_year)
    payload = {
        'btnCongress': u'大會',
        'txtPageSize': u'300',
    }

    def parse(self, response):
        # Submit the search form for plenary sessions (大會) with a
        # large page size so all records come back at once.
        return scrapy.FormRequest.from_response(response,
                                                formdata=self.payload,
                                                callback=self.parse_post)

    def parse_post(self, response):
        links = response.xpath(
            '//table/tr/td/a[contains(@href, "EFileDetail.aspx")]/@href'
        ).extract()
        for link in links:
            yield response.follow(link, callback=self.parse_profile)

    def parse_profile(self, response):
        item = {}
        item['election_year'] = self.election_year
        nodes = response.xpath('//table/tbody/tr')
        ref = {
            u'屆別': {
                'key': 'sitting',
                'path': 'td/span/text()'
            },
            u'類別': {
                'key': 'category',
                'path': 'td/span/text()'
            },
            u'日期': {
                'key': 'date',
                'path': 'td/span/text()'
            },
            u'資料名稱': {
                'key': 'meeting',
                'path': 'td/span/text()'
            },
            u'檔案': {
                'key': 'download_url',
                'path': 'td/a/@href',
                'extra': 'http://obas_front.tcc.gov.tw:8080/Agenda/'
            },
        }
        for node in nodes:
            value = ref.get(node.xpath('th/text()').extract_first().strip())
            if value:
                item[value['key']] = '%s%s' % (value.get(
                    'extra', ''), node.xpath(value['path']).extract_first())
        item['date'] = common.ROC2AD(item['date'])
        ext = re.search(r'FileName=\w+\.(\w+)&',
                        item['download_url']).group(1)
        file_name = '%s_%s.%s' % (item['sitting'], item['meeting'], ext)
        cmd = 'mkdir -p %s && wget -c -O "%s%s" "%s"' % (
            self.output_path, self.output_path, file_name,
            item['download_url'])
        retcode = subprocess.call(cmd, shell=True)
        return item
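
Inside a Scrapy project these spiders run with scrapy crawl meeting. To experiment with a single snippet outside a project, CrawlerProcess works too; a sketch, assuming one of the classes above is saved with its imports in a module named spider:

from scrapy.crawler import CrawlerProcess

from spider import Spider  # hypothetical module holding one snippet

process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(Spider)
process.start()  # blocks until the crawl finishes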