class Spider(scrapy.Spider):
    """Download Tainan City Council (tncc.gov.tw) meeting-minutes files.

    Follows the "出版品" (publications) link, filters rows for the current
    term (`ad`), and downloads each linked file with wget into
    `output_path`, yielding one item per file.
    """
    name = "meeting"
    allowed_domains = ["tncc.gov.tw"]
    start_urls = ["http://www.tncc.gov.tw",]
    download_delay = 0.5
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    output_path = common.meeting_minutes_output_path(county_abbr, election_year)
    # Election year -> council term (屆) number.
    ads = {'2010': 1, '2014': 2, '2018': 3}
    ad = ads[election_year]

    def parse(self, response):
        # Enter the publications section from the landing page.
        return response.follow(
            response.xpath(u'//a[re:test(., "^出版品$")]/@href').extract_first(),
            callback=self.parse_list)

    def parse_list(self, response):
        for tr in response.css('#table2 tr'):
            # Only rows whose link text starts with "第<ad>屆" (current term).
            link = tr.xpath(u'descendant::a[re:test(., "^第%d屆")]/@href' % self.ad).extract_first()
            if link:
                item = {}
                item['election_year'] = self.election_year
                item['date'] = common.ROC2AD(tr.xpath('td[1]/text()').extract_first())
                item['meeting'] = tr.xpath('td[3]/descendant::a/text()').extract_first()
                # Normalize '.' to the full-width enumeration comma.
                item['meeting'] = item['meeting'].replace('.', u'、')
                item['download_url'] = urljoin(response.url, link)
                ext = item['download_url'].split('.')[-1]
                file_name = '%s.%s' % (item['meeting'], ext)
                # SECURITY FIX: file_name comes from scraped page content, so
                # it must never be interpolated into a shell=True command
                # string. Create the directory in-process and invoke wget
                # with an argument list instead.
                if not os.path.isdir(self.output_path):
                    os.makedirs(self.output_path)
                retcode = subprocess.call([
                    'wget', '--no-check-certificate', '-c',
                    '-O', self.output_path + file_name,
                    item['download_url'],
                ])
                yield item
class Spider(scrapy.Spider):
    """Scrape Taichung City Council (proposal.tccc.gov.tw) bill proposals.

    Logs into the proposal system with credentials printed on the page
    itself, queries each session of the current term, walks the paginated
    result table, and enriches each bill from its print view.
    """
    name = "bills"
    allowed_domains = ["tccc.gov.tw", ]
    start_urls = ["http://proposal.tccc.gov.tw/test/"]
    download_delay = 0.5
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    # Election year -> council term (屆) number.
    ads = {'2010': 1, '2014': 2, '2018': 3}
    ad = ads[election_year]

    def parse(self, response):
        # The login account/password are published in the page text as
        # "帳號:... 密碼:..."; scrape them and submit the login form.
        payload = dict(zip(['account', 'pw'], response.xpath('//text()').re(u'帳號:(\S*) 密碼:(\S*)')))
        return scrapy.FormRequest.from_response(response, formname='form1', formdata=payload, callback=self.parse_logined)

    def parse_logined(self, response):
        # Follow the "提案查詢" (proposal search) link after login.
        for link in response.xpath(u'//a[re:test(., "提案查詢$")]/@href'):
            yield response.follow(link, callback=self.parse_query, meta={'dont_redirect': True})

    def parse_query(self, response):
        # Submit one search per session (option) of the current term's optgroup.
        for value in response.xpath(u'//select[@name="SPeriod"]/optgroup[re:test(., "%s屆")]/option/@value' % self.ad).extract():
            payload = {'SPeriod': value}
            yield scrapy.FormRequest.from_response(response, formname='form1', formdata=payload, callback=self.parse_list)

    def parse_list(self, response):
        # Result rows have exactly 11 cells; position()>1 skips the header.
        for node in response.xpath('//*[count(td)=11][position()>1]'):
            item = {}
            item['election_year'] = self.election_year
            item['id'] = node.xpath('td[1]/input/@value').extract_first().zfill(6)
            item['category'] = node.xpath('td[3]/text()').extract_first().split('-')[-1].strip()
            item['type'] = re.sub('\s', '', node.xpath('td[4]/text()').extract_first())
            # Proposers/petitioners are separated by the full-width comma 、.
            item['proposed_by'] = node.xpath('td[5]/text()').extract_first().strip().split(u'、')
            item['petitioned_by'] = node.xpath('td[6]/text()').extract_first().strip().split(u'、') if node.xpath('td[6]/text()').extract_first() else []
            item['bill_no'] = re.sub('\s', '', node.xpath('td[7]/text()').extract_first())
            item['abstract'] = node.xpath('td[8]/descendant-or-self::*/text()').extract_first()
            item['execution'] = re.sub('\s', '', node.xpath('td[10]/text()').extract_first())
            # The print view carries the full bill detail.
            link = urljoin(response.url, 'html_e_print.php?id=%s' % item['id'])
            yield response.follow(link, callback=self.parse_profile, meta={'item': item})
        next_page = response.xpath(u'//a[re:test(., "下一頁")]/@href').extract_first()
        if next_page:
            yield response.follow(next_page, callback=self.parse_list)

    def parse_profile(self, response):
        item = response.meta['item']
        # Labels may contain spaces between CJK characters, hence [\s ]*.
        item['description'] = response.xpath(u'//td[re:test(., "理[\s ]*由")]/following-sibling::td/descendant-or-self::*/text()').extract_first()
        item['methods'] = response.xpath(u'//td[re:test(., "辦[\s ]*法")]/following-sibling::td/descendant-or-self::*/text()').extract_first()
        item['motions'] = []
        for motion in [u'審查意見', u'大會議決']:
            # Build a label regex tolerant of whitespace between each character.
            resolution = response.xpath(u'//td[re:test(., "%s")]/following-sibling::td/descendant-or-self::*/text()' % u'[\s ]*'.join(motion)).extract_first()
            if resolution:
                item['motions'].append(dict(zip(['motion', 'resolution', 'date'], [motion, resolution, None])))
        item['links'] = [
            {
                'url': response.url,
                'note': 'original'
            },
        ]
        yield item
class Spider(scrapy.Spider):
    """Scrape New Taipei City Council (ntp.gov.tw) bills.

    Navigates menu -> tab -> iframe into the bill-query ASP.NET app,
    selects the current term, and pages through results via __doPostBack.
    """
    name = "bills"
    allowed_domains = ["ntp.gov.tw"]
    start_urls = ['http://www.ntp.gov.tw/index.aspx?FType=mb']
    download_delay = 0.5
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    # Election year -> council term (屆) number.
    ads = {'2010': 1, '2014': 2, '2018': 3}
    ad = ads[election_year]

    def parse(self, response):
        # "議事資訊" (council information) image link.
        return response.follow(response.xpath(u'//img[@alt="議事資訊"]/parent::a/@href').extract_first(), callback=self.parse_tab)

    def parse_tab(self, response):
        # "議案查詢" (bill search) tab.
        return response.follow(response.xpath(u'//a[re:test(., "議案查詢")]/@href').extract_first(), callback=self.parse_frame)

    def parse_frame(self, response):
        # The query UI lives inside an iframe titled "議案查詢介面".
        return response.follow(response.xpath(u'//iframe[@title="議案查詢介面"]/@src').extract_first(), callback=self.parse_query)

    def parse_query(self, response):
        # Pick the dd_MJ (term) option matching the current term number.
        payload = {'dd_MJ': response.xpath(u'//select[@name="dd_MJ"]/option[re:test(., "%s")]/@value' % self.ad).extract_first()}
        return scrapy.FormRequest.from_response(response, formname='Form1', formdata=payload, callback=self.parse_list)

    def parse_list(self, response):
        # Skip header (position()>1) and the pager row (position()<last()).
        for node in response.xpath('//table[@id="dg_List"]/tr[position()>1 and position()<last()]'):
            # Detail URL is embedded in the button's window.open(...) onclick.
            yield response.follow(node.xpath('td[1]/input/@onclick').re(u"open\('(.*?=\d+)")[0], callback=self.parse_profile)
        # Only the first page (current pager label ends in "1") fans out the
        # postbacks for all remaining pages, to avoid duplicate requests.
        if response.css('.MultiPageButtonFont span::text').re('1$'):
            # Blank out all visible inputs so only the postback target fires.
            payload = {name: None for name in response.xpath('////input[not(@type="hidden")]/@name').extract()}
            for page in response.css('.MultiPageButtonFont').xpath('descendant::span[1]/following-sibling::a'):
                payload['__EVENTTARGET'] = page.xpath('@href').re("doPostBack\('([^']*)'")[0]
                yield scrapy.FormRequest.from_response(response, formdata=payload, callback=self.parse_list, dont_filter=True, dont_click=True, headers=common.headers(self.county_abbr))

    def parse_profile(self, response):
        item = {}
        item['election_year'] = self.election_year
        item['id'] = re.search(u'BillNO=(\d+)', response.url).group(1).zfill(6)
        # Each lab_* span may be absent; fall back to ''/[] accordingly.
        item['type'] = response.xpath('//span[@id="lab_BillType"]/text()').extract_first().strip() if response.xpath('//span[@id="lab_BillType"]/text()').extract() else ''
        item['category'] = response.xpath('//span[@id="lab_BillClass"]/text()').extract_first().strip() if response.xpath('//span[@id="lab_BillClass"]/text()').extract() else ''
        item['proposed_by'] = response.xpath('//span[@id="lab_Provider"]/text()').extract_first().strip().split(u',') if response.xpath('//span[@id="lab_Provider"]/text()').extract() else []
        item['petitioned_by'] = (response.xpath('//span[@id="lab_SupportMan"]/text()').extract_first() or '').strip().split(u',')
        # Strip all whitespace inside each text fragment, then join by line.
        item['abstract'] = '\n'.join([re.sub('\s', '', x) for x in response.xpath('//span[@id="lab_Reason"]/div//text()').extract()])
        item['description'] = '\n'.join([re.sub('\s', '', x) for x in response.xpath('//span[@id="lab_Description"]/div//text()').extract()])
        item['methods'] = '\n'.join([re.sub('\s', '', x) for x in response.xpath('//span[@id="lab_Method"]/div/text()').extract()])
        motions = []
        # Known resolution stages and the span id that carries each one.
        for motion, id in [(u'市府回覆', 'dg_Response__ctl2_lab_dgReplyDesc'), (u'一讀決議', 'lab_OneResult'), (u'審查意見', 'lab_ExamResult'), (u'大會決議', 'lab_Result'), (u'二讀決議', 'lab_TwoResult'), (u'三讀決議', 'lab_ThreeResult'), ]:
            content = '\n'.join([re.sub('\s', '', x) for x in response.xpath('//span[@id="%s"]//text()' % id).extract()])
            if content:
                motions.append(dict(zip(['motion', 'resolution', 'date'], [motion, content, None])))
        item['motions'] = motions
        item['links'] = [
            {
                'url': response.url,
                'note': 'original'
            }
        ]
        return item
class Spider(scrapy.Spider):
    """Download Taoyuan City Council (tycc.gov.tw) meeting-minutes files.

    Two passes over the same listing page: links ending in 會 (whole-meeting
    records) and links ending in 冊/pdf (per-volume records). Files are
    fetched with wget into `output_path`.
    """
    name = "meeting"
    allowed_domains = ["www.tycc.gov.tw"]
    start_urls = [
        "http://www.tycc.gov.tw/content/public/public_main.aspx?wtp=1&wnd=217",
    ]
    download_delay = 0.5
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    output_path = common.meeting_minutes_output_path(county_abbr, election_year)
    # Election year -> this site's term label value (note: strings, and the
    # 2010 term is "17" — numbering differs from other councils).
    election_years_ad = {'2014': '1', '2010': '17'}
    ad = election_years_ad[election_year]

    def parse(self, response):
        # Pass 1: per-meeting links (text ends with 會) in rows whose first
        # cell's @title names the current term.
        nodes = response.xpath(
            u'//tr/td[re:test(@title, "第%s屆")]/following-sibling::td/a[re:test(., "會$")]' % self.ad)
        for node in nodes:
            item = {}
            item['election_year'] = self.election_year
            item['download_url'] = urljoin(
                response.url, node.xpath('@href').extract_first().strip())
            item['sitting'] = u'第%s屆' % self.ad
            item['meeting'] = node.xpath('descendant::*/text()').re(
                u'屆(.+會)$')[0]
            # Normalize '.' to the full-width enumeration comma.
            item['meeting'] = item['meeting'].replace('.', u'、')
            ext = node.xpath('@href').extract_first().split('.')[-1]
            file_name = '%s_%s.%s' % (item['sitting'], item['meeting'], ext)
            # NOTE(review): file_name derives from scraped text and is
            # interpolated into a shell=True command — shell-injection risk;
            # consider list-argv subprocess. TODO confirm before changing.
            cmd = 'mkdir -p %s && wget -c -O %s%s "%s"' % (
                self.output_path, self.output_path, file_name,
                item['download_url'])
            retcode = subprocess.call(cmd, shell=True)
            yield item
        # Pass 2: per-volume links (text ends with 冊 or pdf); the meeting
        # name is reconstructed from the preceding cell plus the link text.
        nodes = response.xpath(
            u'//tr/td[re:test(@title, "第%s屆")]/following-sibling::td/a[re:test(., "(冊|pdf)$")]' % self.ad)
        for node in nodes:
            item = {}
            item['election_year'] = self.election_year
            item['download_url'] = urljoin(
                response.url, node.xpath('@href').extract_first().strip())
            item['sitting'] = u'第%s屆' % self.ad
            item['meeting'] = '%s%s' % (
                node.xpath('preceding::td[1]/text()').re(u'屆(.+會)')[0],
                node.xpath('descendant::*/text()').re(u'(.+冊)')[0])
            item['meeting'] = item['meeting'].replace('.', u'、')
            ext = node.xpath('@href').extract_first().split('.')[-1]
            file_name = '%s_%s.%s' % (item['sitting'], item['meeting'], ext)
            cmd = 'mkdir -p %s && wget -c -O %s%s "%s"' % (
                self.output_path, self.output_path, file_name,
                item['download_url'])
            retcode = subprocess.call(cmd, shell=True)
            yield item
class Spider(scrapy.Spider):
    """Scrape Taitung County Council (taitungcc.gov.tw) bills.

    Enters the bill-search page, filters listing rows to the current term,
    pages through results via the "下一頁" submit button, and parses each
    bill's detail page into an item.
    """
    name = "bills"
    allowed_domains = ["taitungcc.gov.tw", ]
    start_urls = ["http://www.taitungcc.gov.tw"]
    download_delay = 0.5
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    # Election year -> council term (屆) number.
    ads = {'2009': 17, '2014': 18, '2018': 19}
    ad = ads[election_year]

    def parse(self, response):
        # "議案查詢" (bill search) entry link.
        return response.follow(response.xpath(u'//a[re:test(., "^議案查詢$")]/@href').extract_first(), callback=self.parse_list)

    def parse_list(self, response):
        pages = response.css('#BodyContent_PageHelpWuc1_lbTotalInFo::text').extract_first()
        # FIX: was a stray Python-2 `print pages` debug statement left in the
        # crawl path; route the pager info through the spider logger instead.
        self.logger.debug(pages)
        for node in response.css('table.list3 tbody tr'):
            node_ad = int(node.xpath('td[1]/text()').re(u'(\d+)\s*屆')[0])
            # Rows are listed newest term first: once we drop below the
            # target term we can stop; newer terms are skipped.
            if node_ad < self.ad:
                break
            if node_ad > self.ad:
                continue
            # Row click handler carries the detail-page URL.
            yield response.follow(node.xpath('@onclick').re("href='(.*)'")[0], callback=self.parse_profile)
        next_page = response.xpath(u'//input[re:test(@value, "下一頁")][not(@disabled)]')
        if next_page:
            payload = {next_page.xpath('@name').extract_first(): next_page.xpath('@value').extract_first()}
            yield scrapy.FormRequest.from_response(response, formdata=payload, callback=self.parse_list, dont_filter=True, dont_click=True, headers=common.headers(self.county_abbr))

    def parse_profile(self, response):
        item = {}
        item['election_year'] = self.election_year
        item['id'] = re.search('=([^&]*)', response.url).group(1).zfill(6)
        item['bill_id'] = response.xpath(u'(//td[re:test(., "^案[\s ]*號$")]/following-sibling::td)[1]/text()').extract_first()
        # Category is the leading "...類" prefix of the bill number cell.
        item['category'] = re.search(u'.*?類', response.xpath(u'(//td[re:test(., "^案[\s ]*號$")]/following-sibling::td)[1]/text()').extract_first()).group(0)
        for key, label in [('type', u'議案分類'), ('abstract', u'案由'), ('description', u'說明'), ('methods', u'辦法')]:
            content = response.xpath(u'(//td[re:test(., "^%s$")]/following-sibling::td)[1]/text()' % label).extract_first()
            if content:
                item[key] = content.strip()
        # Strip role titles (議長/副議長/議員) before splitting name lists on 、.
        item['proposed_by'] = re.sub(u'(副?議長|議員)', '', response.xpath(u'(//td[re:test(., "^(動議|提案|請願)(單位|人)(姓名)?$")]/following-sibling::td)[1]/text()').extract_first()).strip().split(u'、')
        item['petitioned_by'] = re.sub(u'(副?議長|議員)', '', (response.xpath(u'(//td[re:test(., "^(連署|附議)人$")]/following-sibling::td)[1]/text()').extract_first() or '')).strip().split(u'、')
        item['motions'] = []
        for motion in [u'審查意見', u'大會決議']:
            resolution = response.xpath(u'(//td[re:test(., "^%s$")]/following-sibling::td)[1]/text()' % motion).extract_first()
            if resolution:
                item['motions'].append(dict(zip(['motion', 'resolution', 'date'], [motion, resolution.strip(), None])))
        item['links'] = [
            {
                'url': response.url,
                'note': 'original'
            }
        ]
        return item
class Spider(scrapy.Spider):
    """Download Keelung City Council (cissearch.kcc.gov.tw) meeting records.

    Reads the total record count, re-posts the form with page size set to
    that count so all rows land on one page, then downloads each record's
    file with wget.
    """
    name = "meeting"
    allowed_domains = ["cissearch.kcc.gov.tw"]
    start_urls = [
        "http://cissearch.kcc.gov.tw/System/MeetingRecord/Default.aspx",
    ]
    download_delay = 0.5
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    output_path = common.meeting_minutes_output_path(county_abbr, election_year)

    def parse(self, response):
        # Total record count from the pager label ("共 N 筆").
        count = response.xpath(
            '//span[@id="ContentPlaceHolder1_DataPager1"]/text()').re(
                u'共\s*(\d+)\s*筆')[0]
        # Setting the page size to the total count defeats pagination.
        payload = {
            'ctl00$ContentPlaceHolder1$DataPager1$ctl02$txtPageSize': count
        }
        # NOTE(review): response.url is passed in from_response's formname
        # slot; presumably harmless since no form bears that name — confirm.
        yield scrapy.FormRequest.from_response(response,
                                               response.url,
                                               formdata=payload,
                                               callback=self.parse_profile,
                                               dont_filter=True)

    def parse_profile(self, response):
        trs = response.xpath('//table[@id="ContentPlaceHolder1_gvIndex"]/tr')
        for tr in trs:
            item = {}
            tds = tr.xpath('td')
            if tds:  # skip header rows (th-only)
                item['election_year'] = self.election_year
                item['date'] = common.ROC2AD(
                    tds[1].xpath('text()').extract_first())
                meeting = tds[2].xpath('text()').extract_first()
                # Meeting name is the text before 紀錄/記錄 ("minutes").
                item['meeting'] = tds[2].xpath('text()').re(
                    u'(.+?)[紀記][錄錄]')[0]
                item['download_url'] = urljoin(
                    response.url,
                    tds[3].xpath('a/@href').extract_first().strip())
                ext = item['download_url'].split('.')[-1]
                file_name = '%s.%s' % (item['meeting'], ext)
                # NOTE(review): scraped meeting name flows into a shell=True
                # command string — shell-injection risk; confirm before
                # switching to list-argv subprocess.
                cmd = 'mkdir -p %s && wget -c -O %s%s "%s"' % (
                    self.output_path, self.output_path, file_name,
                    item['download_url'])
                retcode = subprocess.call(cmd, shell=True)
                time.sleep(1)  # throttle downloads
                yield item
class Spider(scrapy.Spider):
    """Scrape Pingtung County Council (ptcc.gov.tw) bills.

    Bills are not in a dedicated system; they are listed per councilor on
    each member's profile page, so the spider walks the member list and
    reads each member's proposal table.
    """
    name = "bills"
    allowed_domains = ["ptcc.gov.tw", ]
    start_urls = ["http://www.ptcc.gov.tw"]
    download_delay = 0.5
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    # Election year -> council term (屆) number.
    ads = {'2005': 16, '2009': 17, '2014': 18, '2018': 19}
    ad = ads[election_year]

    def parse(self, response):
        # Top menu item #3 leads to the council-members section.
        return response.follow(response.xpath(u'//a[img[@id="topmenu03"]]/@href').extract_first(), callback=self.parse_tab)

    def parse_tab(self, response):
        # "議員介紹" (member introduction) listing.
        return response.follow(response.xpath(u'//a[re:test(., "^議員介紹$")]/@href').extract_first(), callback=self.parse_query)

    def parse_query(self, response):
        # One profile page per councilor.
        for link in response.css('.list.borderleft a::attr(href)'):
            yield response.follow(link, callback=self.parse_profile)

    def parse_profile(self, response):
        # Rows of the 提案 (proposal) table on the member's page, skipping
        # the header row.
        for node in response.xpath(u'(//td[re:test(., "提[\s ]*案")]/following-sibling::td)[1]/descendant::tr[position()>1]'):
            item = {}
            item['election_year'] = self.election_year
            # Composite id: abstract heading + row's term/session cells
            # (no native bill id exists on this site).
            item['id'] = '%s-%s-%s' % (node.xpath('td[3]/b/text()').extract_first(),
                                       node.xpath('td[1]/text()').extract_first(),
                                       node.xpath('td[2]/text()').extract_first(), )
            item['abstract'] = node.xpath('td[3]/text()').extract_first()
            # Proposer is the page's councilor; strip role titles before split.
            item['proposed_by'] = re.sub(u'(副?議長|議員)', '', response.xpath(u'//td[re:test(., "議員:")]')[-1].xpath('text()').extract_first().strip(u'議員:').strip()).strip().split(u'、')
            resolution = (node.xpath('td[4]/text()').extract_first() or '').strip()
            if resolution:
                item['motions'] = [dict(zip(['motion', 'resolution', 'date'], [u'決議', resolution, None]))]
            item['links'] = [
                {
                    'url': response.url,
                    'note': 'original'
                }
            ]
            yield item
class Spider(scrapy.Spider):
    """Download Kinmen County Council (kmc.gov.tw) meeting records.

    Record links either point straight into an ebook viewer (downloaded
    directly) or to an article page whose iframe hosts the PDF; both paths
    end in a recursive wget fetch into `output_path`.
    """
    name = "meeting"
    allowed_domains = ["www.kmc.gov.tw", "ebook.21cms.tw"]
    start_urls = ["http://www.kmc.gov.tw/recorder",]
    download_delay = 0.5
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    output_path = common.meeting_minutes_output_path(county_abbr, election_year)
    # Election year -> this site's term label (written out in Chinese).
    ads = {
        '2014': u'第十八屆',
        '2009': u'第十七屆'
    }
    ad = ads[election_year]

    def parse(self, response):
        nodes = response.css('.panel-body').xpath(u'descendant::a[re:test(., "%s")]' % self.ad)
        for node in nodes:
            link = node.xpath('@href').extract_first()
            item = {}
            item['election_year'] = self.election_year
            # Drop the "(click to read)" suffix and arrows from the label.
            item['sitting'] = node.xpath('text()').extract_first().replace(u'(點擊閱讀)', '').replace('>>', '')
            item['download_url'] = urljoin(response.url, link)
            if re.search('/ebook/', link):
                # Direct ebook link: fetch the PDF straight away.
                file_name = '%s.pdf' % (item['sitting'], )
                # NOTE(review): scraped sitting label flows into a shell=True
                # command string — shell-injection risk; confirm before
                # switching to list-argv subprocess.
                cmd = 'mkdir -p %s && wget -A pdf -nd -r --no-parent -O "%s%s" "%s"' % (self.output_path, self.output_path, file_name, urljoin(response.url, link))
                retcode = subprocess.call(cmd, shell=True)
                yield item
            else:
                # Article page: PDF sits inside an iframe; resolve it first.
                yield response.follow(link, callback=self.parse_iframe, meta={'item': item})

    def parse_iframe(self, response):
        link = response.css('.article-content iframe').xpath('@src').extract_first()
        item = response.meta['item']
        file_name = '%s.pdf' % (item['sitting'], )
        cmd = 'mkdir -p %s && wget -A pdf -nd -r --no-parent -O "%s%s" "%s"' % (self.output_path, self.output_path, file_name, link)
        retcode = subprocess.call(cmd, shell=True)
        yield item
class Spider(scrapy.Spider):
    """Scrape Tainan City Council (tncc.gov.tw) bills.

    Queries the bill-information system once per proposing department for
    the current term, follows the paginated result list, and parses each
    bill's label/value detail table plus any attachments.
    """
    name = "bills"
    allowed_domains = [
        "tncc.gov.tw",
    ]
    start_urls = ["http://www.tncc.gov.tw"]
    download_delay = 0.5
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    # Election year -> council term (屆) number.
    ads = {'2010': 1, '2014': 2, '2018': 3}
    ad = ads[election_year]

    def parse(self, response):
        # "議案資訊" (bill information) entry link.
        return response.follow(
            response.xpath(u'//a[re:test(., "^議案資訊$")]/@href').extract_first(),
            callback=self.parse_query)

    def parse_query(self, response):
        # One query per department option, always scoped to the current term.
        for value in response.xpath(
                u'//select[@name="motiondept"]/option[not(@value="")]/@value'
        ).extract():
            payload = {
                'menu1':
                response.xpath(
                    u'//select[@name="menu1"]/option[re:test(., "第\s*%d\s*屆")]/@value'
                    % self.ad).extract_first(),
                'motiondept':
                value
            }
            yield scrapy.FormRequest.from_response(response,
                                                   formdata=payload,
                                                   callback=self.parse_list,
                                                   dont_filter=True,
                                                   dont_click=True,
                                                   headers=common.headers(
                                                       self.county_abbr))

    def parse_list(self, response):
        for link in response.xpath(
                '//table[@id="printa"]/descendant::tr[count(td)>1]/descendant::a/@href'
        ):
            yield response.follow(link, callback=self.parse_profile)
        next_page = response.xpath(
            u'//a[re:test(., "^下一頁$")]/@href').extract_first()
        if next_page:
            yield response.follow(next_page, callback=self.parse_list)

    def parse_profile(self, response):
        item = {}
        item['election_year'] = self.election_year
        item['id'] = re.search('=([^&]+)', response.url).group(1)
        # Simple label -> field copies from the detail table.
        for key, label in [('type', u'提案類別'), ('category', u'審查會別'),
                           ('abstract', u'主旨'), ('description', u'說明'),
                           ('methods', u'辦法'), ('execution', u'辦理情形')]:
            content = response.xpath(
                u'string((//*[re:test(., "^%s$")]/following-sibling::td)[1])'
                % label).extract_first()
            if content:
                item[key] = content.strip()
        # Strip role titles, then split names on whitespace/、/,/. variants.
        item['proposed_by'] = re.split(
            u'[\s、,.]',
            re.sub(
                u'(副?議長|議員)', '',
                response.xpath(
                    u'(//*[re:test(., "^提案單位/人$")]/following-sibling::td)[1]/text()'
                ).extract_first()).strip())
        item['petitioned_by'] = re.split(
            u'[\s、,.]',
            re.sub(u'(副?議長|議員)', '', (response.xpath(
                u'(//*[re:test(., "^連署人$")]/following-sibling::td)[1]/text()').
                                       extract_first() or '')).strip())
        item['motions'] = []
        # Each motion stage pairs a resolution label with its (optional)
        # date label; labels tolerate whitespace between CJK characters.
        for date, motion in [
            (u'來文日期', u'來文字號'),
            (None, u'審查意見'),
            (u'決議日期', u'大會決議'),
            (u'發文日期', u'發文字號'),
        ]:
            date = response.xpath(
                u'(//*[re:test(., "%s")]/following-sibling::td)[1]/text()' %
                u'[\s ]*'.join(date)).extract_first() if date else None
            resolution = response.xpath(
                u'(//*[re:test(., "%s")]/following-sibling::td)[1]/text()' %
                u'[\s ]*'.join(motion)).extract_first()
            if resolution:
                item['motions'].append(
                    dict(
                        zip(['motion', 'resolution', 'date'],
                            [motion, resolution.strip(), date])))
        item['links'] = [{'url': response.url, 'note': 'original'}]
        # Attachments listed under "議會附件".
        for link in response.xpath(
                u'(//*[re:test(., "^議會附件")]/following-sibling::td)[1]/descendant::a/@href'
        ).extract():
            item['links'].append({
                'url': urljoin(response.url, link),
                'note': 'attach'
            })
        yield item
class Spider(scrapy.Spider):
    """Scrape Hsinchu County Council (hcc.gov.tw) bills.

    Walks both the councilor-proposal and government-proposal listings,
    keeping only bills whose date falls inside the current term's date
    range, and collects attachment links from each detail page.
    """
    name = "bills"
    allowed_domains = ["www.hcc.gov.tw"]
    start_urls = [
        "https://www.hcc.gov.tw/",
    ]
    download_delay = 1
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    # Election year -> [start, end) date window of that term; the listing
    # has no term column, so bills are filtered by date instead.
    election_year_map = {
        "2005": {
            "start": datetime(2006, 3, 1),
            "end": datetime(2010, 3, 1)
        },
        "2009": {
            "start": datetime(2010, 3, 1),
            "end": datetime(2014, 12, 25)
        },
        "2014": {
            "start": datetime(2014, 12, 25),
            "end": datetime(2018, 12, 25),
        },
        "2018": {
            "start": datetime(2018, 12, 25),
            "end": datetime(2022, 12, 25),
        },
    }
    term_range = election_year_map[election_year]

    def parse(self, response):
        # Two listings: councilor proposals and county-government proposals.
        for t in [u'議員提案', u'縣府提案']:
            # Links are relative with a leading ".." that must be stripped.
            yield response.follow(re.sub(
                '^\.\.', '',
                response.xpath(u'//a[@title="%s"]/@href' %
                               t).extract_first()),
                                  callback=self.parse_page,
                                  meta={'type': t})

    def parse_page(self, response):
        bill_type = response.meta['type']
        for node in response.css('.list--table ul:not(:first-child)'):
            date = datetime.strptime(
                node.css('.date-list::text').extract_first(), '%Y-%m-%d')
            # Listing is newest-first: older than the term start means
            # everything after is out of range too, so stop the spider.
            if date < self.term_range['start']:
                raise scrapy.exceptions.CloseSpider('out of date range')
            if date > self.term_range['end']:
                # continue to next page
                break
            item = {}
            item['election_year'] = self.election_year
            item['type'] = bill_type
            item['category'] = node.css(
                u'[data-th*="類別:"]::text').extract_first()
            link = node.css('.more-list a::attr(href)').extract_first()
            # Prefix government proposals with "gov-" so the two listings'
            # ids cannot collide.
            item['id'] = 'gov-%s' % link.split('=')[-1].zfill(
                6) if bill_type == u'縣府提案' else link.split('=')[-1].zfill(6)
            item['abstract'] = node.css(
                u'[data-th*="案由:"]::text').extract_first()
            item['proposed_by'] = (
                node.css(u'[data-th*="提案人:"]::text').extract_first()
                or '').split()
            item['petitioned_by'] = (
                node.css(u'[data-th*="聯署人:"]::text').extract_first()
                or '').split()
            yield response.follow(
                node.css('.more-list a::attr(href)').extract_first(),
                callback=self.parse_detail,
                meta={'item': item})
        if response.css('a.pager.pager-next[href]').extract():
            yield response.follow(response.css(
                'a.pager.pager-next[href]::attr(href)').extract_first(),
                                  callback=self.parse_page,
                                  meta={'type': bill_type})

    def parse_detail(self, response):
        item = response.meta['item']
        item['links'] = [{'url': response.url, 'note': 'original'}]
        # Attachment download links on the detail page.
        for link in response.css(
                '.list--none.actions a::attr(href)').extract():
            item['links'].append({
                'url': urljoin(response.url, link),
                'note': 'attach'
            })
        return item
class Spider(scrapy.Spider):
    """Scrape Yilan County Council (ilcc.gov.tw) bills.

    Navigates a frameset into the bill database, submits the default query,
    pages via __doPostBack, and keeps only bills whose term matches the
    current election year.
    """
    name = "bills"
    allowed_domains = ["ilcc.gov.tw", ]
    start_urls = ["http://www.ilcc.gov.tw"]
    download_delay = 0.5
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    # Election year -> council term (屆) number.
    ads = {'1998': 14, '2002': 15, '2005': 16, '2009': 17, '2014': 18, '2018': 19}
    ad = ads[election_year]

    def parse(self, response):
        # The site is a frameset; the menu lives in the left frame.
        return response.follow(response.xpath(u'//frame[@name="leftFrame"]/@src').extract_first(), callback=self.parse_tab_frame)

    def parse_tab_frame(self, response):
        # "議案資料庫" (bill database) menu entry.
        return response.follow(response.xpath(u'//a[@title="議案資料庫"]/@href').extract_first(), callback=self.parse_frame)

    def parse_frame(self, response):
        return response.follow(response.xpath(u'//frame[@name="mainFrame"]/@src').extract_first(), callback=self.parse_query)

    def parse_query(self, response):
        # Submit the search form unmodified (default filters return all).
        yield scrapy.FormRequest.from_response(response, callback=self.parse_list, dont_filter=True)

    def parse_list(self, response):
        for node in response.css('table#dg tr')[1:]:  # skip header row
            item = {}
            item['id'] = re.search(u'Fmotion_instanceOS=([^&]*)', node.xpath('td[1]/descendant::a/@href').extract_first()).group(1)
            yield response.follow(node.xpath('td[1]/descendant::a/@href').extract_first(), callback=self.parse_profile, meta={'item': item})
        next_page = response.xpath(u'//a[re:test(.,"下一頁")]/@href').extract_first()
        # The "next page" link stays present on the last page; the page
        # <select> having options after the selected one is the real signal.
        has_next_page = response.xpath(u'//select[@name="page"]/option[@selected]/following-sibling::option').extract()
        if next_page and has_next_page:
            payload = {'__EVENTTARGET': re.search("doPostBack\('([^']*)'", next_page).group(1)}
            yield scrapy.FormRequest.from_response(response, formdata=payload, callback=self.parse_list, dont_filter=True, dont_click=True, headers=common.headers(self.county_abbr))

    def parse_profile(self, response):
        item = response.meta['item']
        # Map the page's term number back to an election year.
        # NOTE(review): if the term is not in `ads`, item['election_year']
        # is never set and the comparison below raises KeyError — confirm
        # whether unknown terms can appear.
        item_ad = response.css(u'#lbFmotion_expireb::text').extract_first()
        for election_year, ad in self.ads.items():
            if int(item_ad) == ad:
                item['election_year'] = election_year
                break
        # Drop bills from other terms.
        if item['election_year'] != self.election_year:
            return
        for key, label in [('bill_id', u'lbFmotion_No'), ('type', u'lbFmotion_Category'), ('category', u'lbFmotion_Class'), ('abstract', u'lbFmotion_From'), ('description', u'lbFmotion_Reason'), ('methods', u'lbFmotion_Way')]:
            content = response.css(u'#%s::text' % label).extract_first()
            if content:
                item[key] = content.strip()
        # Strip role titles before splitting name lists.
        item['proposed_by'] = re.split(u'\s|、', re.sub(u'(副?議長|議員)', '', response.css(u'#lbFmotion_People::text').extract_first()).strip())
        item['petitioned_by'] = re.split(u'\s|、', re.sub(u'(副?議長|議員)', '', (response.css(u'#lbFmotion_AddTo::text').extract_first() or '')).strip())
        item['motions'] = []
        # Resolution stages and the element-id prefix holding date/opinion.
        for motion, label in [(u'大會審議', 'lbFmotion_0'), (u'程序會審定', 'lbFmotion_v'), (u'大會決定', 'lbFmotion_1'), (u'分組審查', 'lbFmotion_g'), (u'大會決議', 'lbFmotion_2')]:
            date = response.css(u'#%sdate::text' % label).extract_first()
            resolution = response.css(u'#%sopinion::text' % label).extract_first()
            if date and resolution:
                item['motions'].append(dict(zip(['motion', 'resolution', 'date'], [motion, resolution.strip(), common.ROC2AD(date)])))
        item['links'] = [
            {
                'url': response.url,
                'note': 'original'
            }
        ]
        return item
class Spider(scrapy.Spider):
    """Scrape Kinmen County Council (kmcc.gov.tw) bills.

    Queries the proceedings search once per (bill type, council session)
    pair, pages via __doPostBack, and parses each bill detail page.
    """
    name = "bills"
    allowed_domains = [
        "kmcc.gov.tw",
    ]
    start_urls = ["http://www.kmcc.gov.tw"]
    download_delay = 3
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    # Election year -> council term (屆) number.
    ads = {'2009': 5, '2014': 6, '2018': 7}
    ad = ads[election_year]

    def parse(self, response):
        # "議事錄查詢" (proceedings search) entry link.
        return response.follow(response.xpath(
            u'//a[re:test(., "^議事錄查詢$")]/@href').extract_first(),
                               callback=self.parse_query)

    def parse_query(self, response):
        # Fan out one search per bill type (提案/請願) per council session.
        for bill_type in response.xpath(
                u'//select[@name="Type"]/option[re:test(., "(提案|請願)")]/@value'
        ).extract():
            for council in response.xpath(
                    u'//select[@name="Council"]/option/@value').extract():
                payload = {'Type': bill_type, 'Council': council}
                yield scrapy.FormRequest.from_response(
                    response,
                    formdata=payload,
                    callback=self.parse_list,
                    dont_filter=True,
                    headers=common.headers(self.county_abbr))

    def parse_list(self, response):
        for link in response.css(
                '.GridItem a::attr(href),.GridAlternatingItem a::attr(href)'
        ).extract():
            # NOTE(review): blocking sleep inside a Scrapy callback stalls
            # the whole reactor for a minute per row — presumably deliberate
            # throttling for a fragile site; confirm before removing.
            time.sleep(60)
            yield response.follow(link, callback=self.parse_profile)
        next_page = response.css(
            u'.GridPager span ~ a::attr(href)').extract_first()
        if next_page:
            payload = {
                '__EVENTTARGET':
                re.search("doPostBack\('([^']*)'", next_page).group(1)
            }
            yield scrapy.FormRequest.from_response(response,
                                                   formdata=payload,
                                                   callback=self.parse_list,
                                                   dont_filter=True,
                                                   dont_click=True,
                                                   headers=common.headers(
                                                       self.county_abbr))

    def parse_profile(self, response):
        item = {}
        item['election_year'] = self.election_year
        item['id'] = re.search('=([^&]*)', response.url).group(1).zfill(6)
        # Element id -> item field copies from the detail page.
        for key, label in [('type', u'Type'), ('category', u'Kind'),
                           ('abstract', u'CasePoint'),
                           ('description', u'CaseExplain'),
                           ('methods', u'CaseMethod')]:
            content = response.css(u'#%s::text' % label).extract_first()
            if content:
                item[key] = content.strip()
        # Strip role titles before splitting the proposer list on 、.
        item['proposed_by'] = re.sub(
            u'(副?議長|議員)', '',
            response.css(u'#CaseUnit::text').extract_first()).strip().split(
                u'、')
        item['motions'] = []
        for motion, label in [(u'審查意見', 'CaseOpinion'),
                              (u'大會決議', 'Resolution')]:
            resolution = response.css(u'#%s::text' % label).extract_first()
            if resolution:
                item['motions'].append(
                    dict(
                        zip(['motion', 'resolution', 'date'],
                            [motion, resolution.strip(), None])))
        item['links'] = [{'url': response.url, 'note': 'original'}]
        return item
class Spider(scrapy.Spider):
    """Scrape Taipei City Council (tcc.gov.tw / tccmis.tcc.gov.tw) bills.

    Posts the term number to the proceedings search, then parses each
    bill's detail page row-by-row, dispatching on the row's label cell to
    fill fields and assemble the motion history.
    """
    name = "bills"
    allowed_domains = ["www.tcc.gov.tw", "tccmis.tcc.gov.tw"]
    start_urls = ["http://www.tcc.gov.tw"]
    download_delay = 0.5
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    # Election year -> council term (屆) number.
    ads = {
        '1969': 1,
        '1973': 2,
        '1977': 3,
        '1981': 4,
        '1985': 5,
        '1989': 6,
        '1994': 7,
        '1998': 8,
        '2002': 9,
        '2006': 10,
        '2010': 11,
        '2014': 12,
        '2018': 13
    }
    ad = ads[election_year]

    def parse(self, response):
        # "議事資訊系統" (proceedings information system) entry link.
        return response.follow(response.xpath(
            u'//a[re:test(., "^議事資訊系統$")]/@href').extract_first(),
                               callback=self.parse_frame)

    def parse_frame(self, response):
        # The search form lives in the "Search" frame of a frameset.
        return response.follow(
            response.xpath('//frame[@name="Search"]/@src').extract_first(),
            callback=self.parse_form)

    def parse_form(self, response):
        # OmasDetr = term number; rdoDE selects the search mode.
        return scrapy.FormRequest.from_response(response,
                                                formname='OMForm',
                                                formdata={
                                                    'OmasDetr': str(self.ad),
                                                    'rdoDE': '0'
                                                },
                                                callback=self.parse_post)

    def parse_post(self, response):
        for node in response.xpath('//tr[@id="tr"]'):
            item = {}
            td = node.xpath('td/text()').extract()
            item['election_year'] = self.election_year
            item['id'] = td[1]
            item['bill_no'] = td[2]
            item['type'] = re.sub('\s', '', td[3])
            item['category'] = td[4]
            # Detail page lives on the tccmis host, keyed by sys_no.
            yield scrapy.Request(
                "http://tccmis.tcc.gov.tw/OM/OM_SearchDetail.asp?sys_no=%s" %
                item['id'],
                callback=self.parse_profile,
                meta={'item': item})

    def parse_profile(self, response):
        item = response.meta['item']
        nodes = response.xpath('//div[@id="detail"]/table/tr')
        # committee/council motions are accumulated across rows and appended
        # at the end, since their date and text arrive in separate rows.
        motions, committee_motion, council_motion = [], {}, {}
        for node in nodes:
            # Dispatch on the row's first (label) cell.
            if node.xpath('td/text()')[0].re(u'目前處理程序'):
                item['last_action'] = node.xpath('td/text()').extract()[1]
            elif node.xpath('td/text()')[0].re(u'案由'):
                item['abstract'] = node.xpath('td/text()').extract()[1]
            elif node.xpath('td/text()')[0].re(u'提案人'):
                item['proposed_by'] = node.xpath(
                    'td/div/text()').extract()[0].strip().split(u'、')
            elif node.xpath('td/text()')[0].re(u'召集人/委員'):
                item['proposed_by'] = node.xpath(
                    'td/text()').extract()[1].strip().split(u'、')
            elif node.xpath('td/text()')[0].re(u'議決會次'):
                # Council resolution: first token is the ROC date, the rest
                # identify the sitting.
                council_motion['motion'] = u'大會議決'
                council_motion['date'] = common.ROC2AD(
                    node.xpath('td/text()').extract()[1].split()[0])
                council_motion['sitting'] = ''.join(
                    node.xpath('td/text()').extract()[1].split()[1:])
            elif node.xpath('td/text()')[0].re(u'議決文'):
                council_motion['resolution'] = node.xpath(
                    'td/text()').extract()[1]
            elif node.xpath('td/text()')[0].re(u'案(\s| )+?號'):
                item['bill_no'] = node.xpath('td/text()').extract()[1].strip()
            elif node.xpath('td/text()')[0].re(u'來文文號'):
                # Incoming document: date plus optional reference number.
                td = node.xpath('td/text()').extract()[1].split()
                d = dict(
                    zip(['motion', 'resolution', 'date'],
                        [u'來文', None, common.ROC2AD(td[0])]))
                if len(td) > 1:
                    d['no'] = td[1]
                motions.append(d)
            elif node.xpath('td/text()')[0].re(u'收文日期'):
                motions.append(
                    dict(
                        zip(['motion', 'resolution', 'date'], [
                            u'收文', None,
                            common.ROC2AD(
                                node.xpath('td/text()').extract()[1])
                        ])))
            elif node.xpath('td/text()')[0].re(u'審查日期'):
                committee_motion['motion'] = u'委員會審查意見'
                committee_motion['date'] = common.ROC2AD(
                    node.xpath('td/text()').extract()[1])
            elif node.xpath('td/text()')[0].re(u'審查意見'):
                committee_motion['resolution'] = '\n'.join(
                    node.xpath('td/text()').extract()[1:])
            elif node.xpath('td/text()')[0].re(u'發文文號'):
                td = node.xpath('td/text()').extract()[1].split()
                d = dict(
                    zip(['motion', 'resolution', 'date'],
                        [u'發文', None, common.ROC2AD(td[0])]))
                if len(td) > 1:
                    d['no'] = td[1]
                motions.append(d)
            elif node.xpath('td/text()')[0].re(u'執行情形'):
                item['execution'] = node.xpath('td/text()').extract()[1]
            elif node.xpath('td/text()')[0].re(u'備[\s]*?註'):
                item['remark'] = '\n'.join(
                    node.xpath('td/text()').extract()[1:])
        for motion in [committee_motion, council_motion]:
            if motion:
                motions.append(motion)
        # NOTE(review): some motions carry date=None; sorting None against
        # strings works on Python 2 only — confirm before porting to py3.
        item['motions'] = sorted(motions,
                                 key=lambda x: x.get('date'),
                                 reverse=True)
        item['links'] = [{'url': response.url, 'note': 'original'}]
        return item
class Spider(scrapy.Spider):
    """Download Yilan County Council plenary meeting minutes.

    Navigates image-map -> frame -> search form, walks the paginated result
    table, and shells out to wget to fetch each minutes file.
    """
    name = "meeting"
    allowed_domains = ["www.ilcc.gov.tw"]
    start_urls = [
        "http://www.ilcc.gov.tw/Html/H_06/H_06.asp",
    ]
    download_delay = 0.5
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    output_path = common.meeting_minutes_output_path(county_abbr,
                                                     election_year)
    # Restrict the search to plenary sessions (大會).
    payload = {
        'ddlcounciltype': u'大會',
    }

    def parse(self, response):
        # Follow the "meeting minutes" (會議紀錄) image-map area.
        return response.follow(
            response.xpath(u'//area[@alt="會議紀錄"]/@href').extract_first(),
            callback=self.parse_frame)

    def parse_frame(self, response):
        # Enter the main content frame.
        return response.follow(
            response.xpath('//frame[@id="FrMain"]/@src').extract_first(),
            callback=self.parse_meeting_info)

    def parse_meeting_info(self, response):
        # NOTE(review): response.url is passed positionally (lands on
        # `formname`); scrapy falls back to the first form when no form
        # matches that name — confirm this is intentional.
        return scrapy.FormRequest.from_response(response,
                                                response.url,
                                                formdata=self.payload,
                                                callback=self.parse_pages)

    def parse_pages(self, response):
        # Re-submit the form once per page of results.
        pages = response.xpath(
            '//select[@name="page"]/option/@value').extract()
        for page in pages:
            yield scrapy.FormRequest.from_response(response,
                                                   response.url,
                                                   formdata={
                                                       'page': page,
                                                       'btSearch': None
                                                   },
                                                   callback=self.parse_post)

    def parse_post(self, response):
        # Result table rows: td[1]=date, td[2]+td[3]=sitting, td[4]=detail
        # link, td[5]=meeting name.
        trs = response.xpath('//table[@id="dg"]/descendant::tr[position()>1]')
        for tr in trs:
            item = {}
            item['election_year'] = self.election_year
            item['date'] = re.sub('\s', '',
                                  tr.xpath('string(td[1])').extract_first())
            item['sitting'] = re.sub(
                '\s', '',
                '%s%s' % (tr.xpath('string(td[2])').extract_first(),
                          tr.xpath('string(td[3])').extract_first()))
            item['meeting'] = re.sub('\s', '',
                                     tr.xpath('string(td[5])').extract_first())
            yield response.follow(
                tr.xpath('td[4]/descendant::a/@href').extract_first(),
                callback=self.parse_profile,
                meta={'item': item})

    def parse_profile(self, response):
        # Fetch the attachment link and download it with wget; log pages
        # that carry no attachment.
        item = response.meta['item']
        item['download_url'] = response.xpath(
            '//td/a[@target="_blank"]/@href').extract_first()
        if item['download_url']:
            ext = re.search(u'\.(\w+)$', item['download_url']).group(1)
            file_name = '%s_%s.%s' % (item['sitting'], item['meeting'],
                                      ext)
            # NOTE(review): URL and names are interpolated into a shell
            # command (shell=True) — shell-injection risk if the site is
            # hostile; consider the list form of subprocess.call.
            cmd = 'mkdir -p %s && wget -c -O %s%s "%s"' % (
                self.output_path, self.output_path, file_name,
                item['download_url'])
            retcode = subprocess.call(cmd, shell=True)
        else:
            logging.error(response.url)
        return item
class Spider(scrapy.Spider):
    """Download New Taipei City Council meeting minutes.

    Walks sitting index pages, then each meeting schedule table; PDFs are
    fetched with wget, HTML minutes are saved as plain text.
    """
    name = "meeting"
    allowed_domains = ["www.ntp.gov.tw"]
    start_urls = [
        'https://www.ntp.gov.tw/content/information/information04.aspx'
    ]
    download_delay = 1
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    output_path = common.meeting_minutes_output_path(county_abbr,
                                                     election_year)

    def parse(self, response):
        # Only follow the HTML-version links of each sitting.
        for node in response.xpath(
                u'//a[contains(@title, "HTML檔")]/@href').extract():
            yield response.follow(node, callback=self.parse_sitting)

    def parse_sitting(self, response):
        # Each sitting page links to individual meeting schedule pages.
        for node in response.xpath(u'//td/descendant::a/@href').extract():
            yield response.follow(node, callback=self.parse_meeting)

    def parse_meeting(self, response):
        # The page title "...日程表" names the sitting; schedule rows are the
        # ones whose third cell is numeric (the meeting number).
        try:
            sitting = response.xpath('//text()').re(u'(.+)日程表')[0]
            trs = [
                tr for tr in response.xpath('//table/descendant::tr')
                if tr.xpath('td[3]/text()').re('\d+')
            ]
            for tr in trs:
                item = {}
                item['election_year'] = self.election_year
                item['date'] = common.ROC2AD(
                    tr.xpath('td[1]/text()').extract_first())
                item['sitting'] = sitting
                item['meeting'] = tr.xpath('td[3]/text()').extract_first()
                item['download_url'] = tr.xpath(
                    'td[6]/descendant::a[1]/@href').extract_first()
                ext = item['download_url'].split('.')[-1]
                file_name = '%s_%s.%s' % (item['sitting'], item['meeting'],
                                          ext)
                # Dispatch on the file type; anything else is skipped.
                if ext == 'pdf':
                    yield response.follow(item['download_url'],
                                          callback=self.download_pdf,
                                          meta={
                                              'item': item,
                                              'file_name': file_name
                                          })
                elif ext == 'htm':
                    yield response.follow(item['download_url'],
                                          callback=self.parse_html,
                                          meta={
                                              'item': item,
                                              'file_name': file_name
                                          })
        # Some linked documents are not HTML (e.g. direct binaries); the
        # selector raises NotSupported on them — ignore those pages.
        except scrapy.exceptions.NotSupported:
            pass

    def download_pdf(self, response):
        # Shell out to wget for the PDF (shell=True: trusted-ish input).
        item = response.meta['item']
        item['download_url'] = response.url
        cmd = 'mkdir -p %s && wget --no-check-certificate -c -O %s%s "%s"' % (
            self.output_path, self.output_path, response.meta['file_name'],
            item['download_url'])
        retcode = subprocess.call(cmd, shell=True)
        return item

    def parse_html(self, response):
        # HTML minutes are preformatted text; persist them as a .txt file.
        item = response.meta['item']
        item['download_url'] = response.url
        text = '\n'.join(response.xpath('//pre/text()').extract())
        write_file(
            text, '%s%s_%s.txt' % (self.output_path, item['sitting'],
                                   item['meeting']))
        return item
) ___ order by role, count desc ) ____ group by role order by sum desc ) _____ ) row )) )) where uid = %s ''', [uid, uid]) conn = db_settings.con() c = conn.cursor() election_year = common.election_year('') if len(argv): target_county = ast.literal_eval(argv[1])['county'] else: target_county = '*' for f in sorted(glob.glob('../../data/%s/bills-*.json' % target_county)): if int(re.search('bills-(\d+).json', f).group(1)) < int(election_year): continue print f county_abbr = f.split('/')[-2] f_election_year = re.search('-(\d+)\.json', f).group(1) county = common.county_abbr2string(county_abbr) county_abbr3 = common.county2abbr3(county) dict_list = json.load(open(f)) for bill in dict_list:
class Spider(scrapy.Spider):
    """Scrape Yunlin County Council bills from the TPA digital archive.

    Guest-login flow: set descending sort, reload the listing, filter rows
    to the target term by title regex, then resolve each bill through a
    302 redirect to a JSON metadata endpoint.
    """
    name = "bills"
    allowed_domains = [
        "tpa.gov.tw",
    ]
    start_urls = ["http://ylcc.digital.tpa.gov.tw/"]
    download_delay = 0.5
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    # Election year -> regex matching that term's bill titles.
    ads = {
        '2009': u'雲林縣.*?第(十七|17)屆',
        '2014': u'雲林縣.*?第(十八|18)屆',
        '2018': u'雲林縣.*?第(十九|19)屆'
    }
    ad = ads[election_year]

    def parse(self, response):
        # Enter as guest.
        return response.follow('/index.php?act=GuestLogin',
                               callback=self.parse_login)

    def parse_login(self, response):
        # Follow the "proposals" (提案) section link.
        return response.follow(
            response.xpath(u'//a[re:test(., "^提案$")]/@href').extract_first(),
            callback=self.parse_unordered,
            headers=common.headers(self.county_abbr))

    def parse_unordered(self, response):
        # Ask the server to sort results newest-first, then reload.
        payload = {
            'act': 'search_set',
            'field': 'SET_OrderByMethod',
            'value': 'DESC'
        }
        yield scrapy.FormRequest(response.urljoin('application.php'),
                                 formdata=payload,
                                 callback=self.parse_reload,
                                 meta={'url': response.url},
                                 headers=common.headers(self.county_abbr))

    def parse_reload(self, response):
        # Revisit the listing now that the sort order is set.
        return response.follow(response.meta['url'],
                               callback=self.parse_query,
                               dont_filter=True,
                               headers=common.headers(self.county_abbr))

    def parse_query(self, response):
        # Results are sorted DESC, so the first row outside the target term
        # means every later row is older: stop the spider there.
        pages = re.sub(
            '\D', '',
            response.css('.result_select').xpath('string()').extract_first())
        for node in response.css('.result_content'):
            link_node = node.css('.acc_link a')
            if link_node.xpath('text()').re(self.ad):
                item = {}
                item['election_year'] = self.election_year
                link = link_node.xpath('@href').extract_first()
                item['id'] = node.css('.acc_type::text').extract_first().split(
                    '@')[0].strip()
                # "Category hierarchy" (類別階層) value: "/<type>/<category>".
                level = node.xpath(
                    u'string((descendant::span[re:test(., "類別階層")]/following-sibling::span)[1])'
                ).extract_first()
                item['type'], item['category'] = re.search(
                    u'/([^/]+)/?(.*)$', level).groups()
                item['abstract'] = re.sub(
                    '\s', '', node.css('.result_text::text').extract_first())
                # The profile URL answers with a 302 we must inspect, not
                # follow automatically.
                yield response.follow(link,
                                      callback=self.parse_profile,
                                      meta={
                                          'item': item,
                                          'handle_httpstatus_list': [302],
                                          'dont_redirect': True
                                      },
                                      headers=common.headers(self.county_abbr))
            else:
                raise scrapy.exceptions.CloseSpider('out of date range')
            # Throttle between profile requests.
            time.sleep(.5)
        next_page = response.css(
            '.page_botton.pb_pagedw::attr(href)').extract_first()
        if next_page:
            yield response.follow(next_page, callback=self.parse_query)

    def parse_profile(self, response):
        # Extract the document id from the redirect Location and post the
        # act_initial form to obtain the JSON metadata.
        try:
            payload = {
                'act': 'act_initial',
                'target': re.search('=([^&]*)',
                                    response.headers['Location']).group(1),
                'refer': 'serial'
            }
        except:
            # Dump diagnostics before aborting: the 302 contract broke.
            print response.headers
            print response.body
            print response.status
            print response.url
            print 'profile:', response.urljoin(response.headers['Location'])
            raise scrapy.exceptions.CloseSpider('no redirect location')
        yield scrapy.FormRequest(response.urljoin(
            response.headers['Location']),
                                 formdata=payload,
                                 callback=self.parse_post,
                                 meta={'item': response.meta['item']})

    def parse_post(self, response):
        # The metadata endpoint returns JSON; proposer may live under any of
        # several keys depending on who filed the bill.
        item = response.meta['item']
        try:
            jr = json.loads(response.body_as_unicode())['data']['meta'][0]
        except:
            print 'no json response:', response.url
            raise scrapy.exceptions.CloseSpider('no json response')
        item['proposed_by'] = re.sub(
            u'(副?議長|議員)', '',
            jr.get('Member') or jr.get('Organ') or jr.get('OrganPetiti')
            or jr.get('Chairman') or jr.get('Council')).strip().split(u',')
        if not item['proposed_by'][0]:
            print jr
            raise scrapy.exceptions.CloseSpider('empty proposed_by')
        item['petitioned_by'] = re.sub(
            u'(副?議長|議員)', '',
            (jr.get('MemberRelated') or jr.get('OrganPetiti')
             or '')).strip().split(u',')
        item['links'] = [{'url': response.url, 'note': 'original'}]
        return item
class Spider(scrapy.Spider):
    """Scrape Keelung City Council councilor profiles.

    Reads two local reference files in __init__ (constituency -> district
    map, and MOI candidate data used for gender lookup), then walks the
    "議員資訊" listing and parses one profile page per councilor.
    """
    name = "councilors"
    allowed_domains = ["www.kmc.gov.tw"]
    start_urls = [
        "http://www.kmc.gov.tw/",
    ]
    download_delay = 0.5
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    county = common.county_abbr2string(county_abbr)

    def __init__(self):
        # Constituency (選區) -> administrative district mapping.
        with open(os.path.join(os.path.dirname(__file__),
                               'constituency.json'), 'r') as infile:
            self.constituency = json.loads(infile.read())
        # MOI candidate reference, keyed by whitespace-stripped name; only
        # Keelung City (基隆市) entries are kept.
        with open(
                os.path.join(os.path.dirname(__file__),
                             '../../data/cand-moi-county-control-2018.json'),
                'r') as infile:
            self.ref = {
                re.sub(u'[\s ]', '', person['idname']): person
                for person in json.loads(infile.read())
                if person['cityname'] == u'基隆市'
            }

    def parse(self, response):
        # Follow the "councilor information" (議員資訊) link.
        return response.follow(
            response.xpath(u'//a[re:test(., "^議員資訊$")]/@href').extract_first(),
            callback=self.parse_list)

    def parse_list(self, response):
        for link in response.css('#speaker a::attr(href)'):
            yield response.follow(link, callback=self.parse_profile)

    def parse_profile(self, response):
        """Parse one councilor profile page into an item dict."""
        item = {}
        item['election_year'] = self.election_year
        item['county'] = self.county
        item['in_office'] = True
        # Terms run from Dec 25 of the election year to Dec 24 four years on.
        item['term_start'] = '%s-12-25' % item['election_year']
        item['term_end'] = {'date': '2018-12-24'}
        # Fix: the name/title span was selected twice — once only to feed a
        # leftover debug print.  Evaluate it once and reuse the result.
        name_title = response.xpath(
            u'//p/span[re:test(., "\s+副?議(員|長)")]/text()').extract_first()
        # The span holds "<name> <title>", e.g. "王OO 議員".
        item['name'], item['title'] = name_title.split()
        # Gender comes from the MOI reference data.
        item['gender'] = self.ref[item['name']]['sex']
        item['constituency'] = response.xpath('//td/text()').re(
            u'選區:\s*(.+)')[0].strip()
        item['district'] = self.constituency[item['constituency']]
        item['image'] = urljoin(
            response.url, response.xpath(u'//p/img/@src').extract_first())
        item['links'] = [{'url': response.url, 'note': u'議會個人官網'}]
        item['party'] = response.xpath('//td/text()').re(
            u'政黨:\s*(.+)')[0].strip()
        item['birth'] = common.ROC2AD(
            response.xpath('//td/text()').re(u'出生日期:\s*(.+)')[0])
        website = response.xpath('//td/text()').re(u'網站連結:\s*(.+)')
        if website:
            item['links'].append({'url': website[0].strip(), 'note': u'個人網站'})
        # Contact rows are matched label-by-label (labels may be padded with
        # whitespace between characters, hence the '\s*'.join).
        item['contact_details'] = []
        contact_mappings = {
            u'連絡電話': 'voice',
            u'傳真號碼': 'fax',
            u'服務處': 'address',
            u'電子郵件': 'email'
        }
        for label, name in contact_mappings.items():
            values = [
                x.strip()
                for x in response.xpath(u'//td[re:test(., "%s:")]/text()' %
                                        '\s*'.join(label)).re(u'%s:\s*(.+)\s*' %
                                                              label)
                if x.strip()
            ]
            for value in values:
                item['contact_details'].append({
                    'label': label,
                    'type': name,
                    'value': value
                })
        # Experience and platform lists sit in the tables following the 2nd
        # and 3rd "speaker0*" section-header images respectively.
        item['experience'] = [
            x.strip()
            for x in response.xpath(u'//img[contains(@src, "speaker0")]')
            [1].xpath('ancestor::tr/following-sibling::tr[1]//tr/td[1]/text()'
                      ).extract() if x.strip()
        ]
        item['platform'] = [
            x.strip()
            for x in response.xpath(u'//img[contains(@src, "speaker0")]')
            [2].xpath('ancestor::tr/following-sibling::tr[1]//tr/td[1]/text()'
                      ).extract() if x.strip()
        ]
        yield item
class Spider(scrapy.Spider):
    """Scrape Keelung City Council plenary bills (大會提案).

    Drives an ASP.NET WebForms site: postback to select the term, one tab
    per bill type, page-size bump to fetch everything in one page, then a
    detail page per bill.
    """
    name = "bills"
    handle_httpstatus_list = [302]
    allowed_domains = ["kcc.gov.tw"]
    start_urls = ["http://www.kcc.gov.tw",]
    download_delay = 0.5
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    # Election year -> term number written as a Chinese numeral.
    ads = {'2010': u'一', '2014': u'二', '2018': u'三'}
    ad = ads[election_year]

    def parse(self, response):
        # Follow the "plenary proposals" (大會提案) link.
        return response.follow(response.xpath(u'//a[re:test(., "^大會提案$")]/@href').extract_first(),
                               callback=self.parse_query)

    def parse_query(self, response):
        # Select the target term's session and trigger the search postback
        # (__EVENTTARGET is pulled out of the LinkButton's href).
        payload = {
            'ctl00$ContentPlaceHolder1$uscPeriodSessionMeeting$ddlSession':
                response.xpath(u'//select[@name="ctl00$ContentPlaceHolder1$uscPeriodSessionMeeting$ddlSession"]/option[re:test(., "%s屆")]/@value' % self.ad).extract_first(),
            'ctl00$ContentPlaceHolder1$uscPeriodSessionMeeting$ddlMeeting': '',
            '__EVENTTARGET':
                re.search("__doPostBack\('([^']*)",
                          response.css('#ContentPlaceHolder1_LinkButton1::attr(href)').extract_first()).group(1)
        }
        yield scrapy.FormRequest.from_response(response,
                                               formdata=payload,
                                               callback=self.parse_type,
                                               dont_filter=True,
                                               dont_click=True,
                                               headers=common.headers(self.county_abbr))

    def parse_type(self, response):
        # One tab per bill type; each tab label carries a result count.
        # Set the pager's page size to that count so one request per tab
        # returns every row.
        tabs = response.xpath('//div[@id="tabs"]/ul/li/a')
        for i, tab in enumerate(tabs, 1):
            type, count = tab.xpath('text()').extract()
            count = re.sub('\D', '', count)
            if count:
                payload = {"ctl00$ContentPlaceHolder1$DataPager%d$ctl02$txtPageSize" % i: count}
                # The first tab's Go button has no numeric suffix.
                if i != 1:
                    payload["ctl00$ContentPlaceHolder1$btnGo%d" % i] = " Go "
                else:
                    payload["ctl00$ContentPlaceHolder1$btnGo"] = " Go "
                yield scrapy.FormRequest.from_response(response,
                                                       formdata=payload,
                                                       callback=self.parse_tab,
                                                       dont_filter=True,
                                                       meta={'type': tab.xpath('text()').extract_first().strip(),
                                                             'tab_id': 'tabs-%d' % i})

    def parse_tab(self, response):
        # Rows with more than one cell are data rows; the detail link hides
        # in the row's onclick handler.
        trs = response.xpath('//div[@id="%s"]/div/table/tr[count(td)>1]' % response.meta['tab_id'])
        for tr in trs:
            item = {}
            item['election_year'] = self.election_year
            item['type'] = response.meta['type']
            item['last_action'] = tr.xpath('td[6]/text()').extract_first()
            link = tr.xpath('td[@onclick]/@onclick').re(u"\.href='([^']+)'")[0]
            yield response.follow(link,
                                  callback=self.parse_profile,
                                  meta={'dont_redirect': True, 'item': item})

    def parse_profile(self, response):
        # Detail page: label cells on the left, values on the right.
        item = response.meta['item']
        # Id is derived from all query-string values of the detail URL.
        item['id'] = '-'.join(re.findall(u'=([^&]*)', response.url))
        for key, label in [('category', u'類別'),
                           ('abstract', u'案由'),
                           ('description', u'說明'),
                           ('methods', u'辦法'),
                           ('remark', u'備註'), ]:
            content = response.xpath(u'string((//td[re:test(., "%s")]/following-sibling::td)[1])' % label).extract_first()
            if content:
                item[key] = content.strip()
        # Strip honorifics (議長/副議長/議員) and split names on whitespace or
        # fullwidth enumeration commas.
        item['proposed_by'] = re.split(u'\s|、', re.sub(u'(副?議長|議員)', '', u'、'.join([x.strip() for x in response.xpath(u'(//td[re:test(., "提案(人|單位)")]/following-sibling::td)[1]/text()').extract()])))
        item['petitioned_by'] = re.split(u'\s|、', re.sub(u'(副?議長|議員)', '', u'、'.join([x.strip() for x in (response.xpath(u'(//td[re:test(., "連署人")]/following-sibling::td)[1]/text()').extract() or [])])))
        # Collect the reading/review motions that actually carry content.
        item['motions'] = []
        for motion in [u'一讀', u'委員會審查意見', u'二讀決議', u'三讀決議', ]:
            date = common.ROC2AD(''.join(response.xpath(u'(//td[re:test(., "%s")]/following-sibling::td)[1]/span/text()' % motion).extract()))
            resolution = ''.join([x.strip() for x in response.xpath(u'(//td[re:test(., "%s")]/following-sibling::td)[1]/text()' % motion).extract()])
            if date or resolution:
                item['motions'].append(dict(zip(['motion', 'resolution', 'date'], [motion, resolution, date])))
        item['links'] = [
            {
                'url': response.url,
                'note': 'original'
            }
        ]
        return item
class Spider(scrapy.Spider):
    """Scrape Chiayi County Council bills from its database query system.

    Submits one query per bill kind, walks the descending-order paginated
    listing until rows fall below the target term, and parses each detail
    page.
    """
    name = "bills"
    allowed_domains = ["cyscc.gov.tw", ]
    start_urls = ["http://www.cyscc.gov.tw"]
    download_delay = 0.5
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    # Election year -> council term (屆) number.
    ads = {'2009': 17, '2014': 18, '2018': 19}
    ad = ads[election_year]

    def parse(self, response):
        # Follow the "council database" (議會資料庫) link.
        return response.follow(response.xpath(u'//a[re:test(., "^議會資料庫$")]/@href').extract_first(),
                               callback=self.parse_tab)

    def parse_tab(self, response):
        # Then into the query system (議會資料庫查詢系統).
        return response.follow(response.xpath(u'//a[re:test(., "^議會資料庫查詢系統$")]/@href').extract_first(),
                               callback=self.parse_query)

    def parse_query(self, response):
        # One search per bill-kind radio button.
        for value in response.xpath(u'//input[@name="ctl00$ContentPlaceHolder1$rbtnMKind"]/@value').extract():
            payload = {'ctl00$ContentPlaceHolder1$rbtnMKind': value}
            yield scrapy.FormRequest.from_response(response,
                                                   formdata=payload,
                                                   callback=self.parse_list,
                                                   dont_filter=True,
                                                   dont_click=True,
                                                   headers=common.headers(self.county_abbr))

    def parse_list(self, response):
        # Rows are ordered newest-term-first: stop at the first row older
        # than the target term, skip newer ones, follow the rest.
        # Fix: `node_ad` was only bound inside the loop, so an empty result
        # table raised NameError at the pagination check below; it now
        # defaults to the target term.  A leftover debug print of the page
        # count (and its unused variable) was removed.
        node_ad = self.ad
        for node in response.css('.main3_3_04,.main3_3_05'):
            node_ad = int(node.xpath('td[2]/text()').re(u'(\d+)\s*屆')[0])
            if node_ad < self.ad:
                break
            if node_ad > self.ad:
                continue
            yield response.follow(node.xpath('td[6]/span/a/@href').extract_first(),
                                  callback=self.parse_profile)
        # Keep paginating only while the last seen row is still within (or
        # newer than) the target term.
        next_page = response.xpath(u'//a[re:test(.,"下一頁")]/@href').extract_first()
        if next_page and node_ad >= self.ad:
            payload = {'__EVENTTARGET': re.search("doPostBack\('([^']*)'", next_page).group(1)}
            yield scrapy.FormRequest.from_response(response,
                                                   formdata=payload,
                                                   callback=self.parse_list,
                                                   dont_filter=True,
                                                   dont_click=True,
                                                   headers=common.headers(self.county_abbr))

    def parse_profile(self, response):
        """Parse one bill detail page into an item dict."""
        item = {}
        item['election_year'] = self.election_year
        # Id is the first query-string value, zero-padded to six digits.
        item['id'] = re.search('=([^&]*)', response.url).group(1).zfill(6)
        for key, label in [('type', u'提案類別'),
                           ('category', u'類別'),
                           ('abstract', u'案由'),
                           ('description', u'說明'),
                           ('methods', u'辦法')]:
            content = response.xpath(u'(//td[re:test(., "^%s$")]/following-sibling::td)[1]/span/text()' % label).extract_first()
            if content:
                item[key] = content.strip()
        # Strip honorifics and split on fullwidth enumeration commas.
        item['proposed_by'] = re.sub(u'(副?議長|議員)', '', response.xpath(u'(//td[re:test(., "^提\s*案\s*人$")]/following-sibling::td)[1]/span/text()').extract_first()).strip().split(u'、')
        item['petitioned_by'] = re.sub(u'(副?議長|議員)', '', (response.xpath(u'(//td[re:test(., "^連\s*署\s*人$")]/following-sibling::td)[1]/span/text()').extract_first() or '')).strip().split(u'、')
        item['motions'] = []
        for motion in [u'審查意見', u'大會決議']:
            resolution = response.xpath(u'(//td[re:test(., "^%s$")]/following-sibling::td)[1]/span/text()' % motion).extract_first()
            if resolution:
                item['motions'].append(dict(zip(['motion', 'resolution', 'date'], [motion, resolution.strip(), None])))
        item['links'] = [
            {
                'url': response.url,
                'note': 'original'
            }
        ]
        # Attachments linked from the detail page.
        for link in response.css('#ctl00_ContentPlaceHolder1_fvDetail_dlRelFile a::attr(href)').extract():
            item['links'].append(
                {
                    'url': urljoin(response.url, link),
                    'note': 'attach'
                }
            )
        return item
class Spider(scrapy.Spider):
    """Scrape Chiayi City Council resolved bills (議決案).

    One query per session id, then per-page parsing of label/value tables;
    labels may contain arbitrary whitespace/NBSP between characters, hence
    the `[\s ]*` regex padding throughout.
    """
    name = "bills"
    allowed_domains = [
        "cycc.gov.tw",
    ]
    start_urls = ["http://www.cycc.gov.tw/index2.asp"]
    download_delay = 0.5
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    # Election year -> council term (屆) number.
    ads = {'2009': 17, '2014': 18, '2018': 19}
    ad = ads[election_year]

    def parse(self, response):
        # Follow the "bill query" (議案查詢) image link.
        return response.follow(
            response.xpath(u'//a[img[@alt="議案查詢"]]/@href').extract_first(),
            callback=self.parse_tab)

    def parse_tab(self, response):
        # Then the "resolved-bill search" (議決案檢索) link.
        return response.follow(response.xpath(
            u'//a[re:test(., "^議決案檢索$")]/@href').extract_first(),
            callback=self.parse_query)

    def parse_query(self, response):
        # Submit the search form once per session id option.
        for value in response.xpath(
                u'//select[@name="sid"]/option/@value').extract():
            payload = {'sid': value}
            yield scrapy.FormRequest.from_response(response,
                                                   formdata=payload,
                                                   callback=self.parse_list,
                                                   dont_filter=True,
                                                   dont_click=True,
                                                   headers=common.headers(
                                                       self.county_abbr))

    def parse_list(self, response):
        # Each bill is its own bgcolor'd table; captioned tables are layout.
        for i, node in enumerate(
                response.xpath('//table[@bgcolor][not(caption)]')):
            item = {}
            item['election_year'] = self.election_year
            # Id: digits of the page URL joined, plus the table's index.
            item['id'] = '%s-%02d' % ('-'.join(
                re.sub('\D', ' ', response.url).split()), i)
            for key, label in [('category', u'類[\s ]*別'),
                               ('abstract', u'案[\s ]*由'),
                               ('description', u'理[\s ]*由'),
                               ('methods', u'辦[\s ]*法')]:
                content = response.xpath(
                    u'(//*[re:test(., "%s")]/following-sibling::td)[1]/span/text()'
                    % label).extract_first()
                if content:
                    item[key] = content.strip()
            # Strip honorifics and split on fullwidth enumeration commas.
            item['proposed_by'] = re.sub(
                u'(副?議長|議員)', '',
                response.xpath(
                    u'(//*[re:test(., "提[\s ]*案[\s ]*人")]/following-sibling::td)[1]/span/text()'
                ).extract_first()).strip().split(u'、')
            item['petitioned_by'] = re.sub(u'(副?議長|議員)', '',
                                           (response.xpath(
                u'(//*[re:test(., "連[\s ]*署[\s ]*人")]/following-sibling::td)[1]/span/text()'
                                           ).extract_first() or '')).strip().split(u'、')
            item['motions'] = []
            for motion in [u'審查意見', u'決議']:
                resolution = response.xpath(
                    u'(//*[re:test(., "%s")]/following-sibling::td)[1]/span/text()'
                    % u'[\s ]*'.join(motion)).extract_first()
                if resolution:
                    item['motions'].append(
                        dict(
                            zip(['motion', 'resolution', 'date'],
                                [motion, resolution.strip(), None])))
            item['links'] = [{'url': response.url, 'note': 'original'}]
            yield item
        # Paginate via the "next page" (下一頁) image link.
        next_page = response.xpath(
            u'//a[img[@alt="下一頁"]]/@href').extract_first()
        if next_page:
            yield response.follow(next_page, callback=self.parse_list)
class Spider(scrapy.Spider):
    """Download Taipei City Council plenary meeting minutes.

    Posts the file-search form (plenary only, 300 rows per page), follows
    each EFileDetail page, maps its label rows into item fields, and fetches
    the attachment with wget.
    """
    name = "meeting"
    allowed_domains = ["obas_front.tcc.gov.tw"]
    start_urls = [
        "http://obas_front.tcc.gov.tw:8080/Agenda/EFileSearch.aspx?FileGrpKind=2&h=600",
    ]
    download_delay = 0.5
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    output_path = common.meeting_minutes_output_path(county_abbr,
                                                     election_year)
    # Search payload: plenary sessions (大會), 300 results per page.
    payload = {
        'btnCongress': u'大會',
        'txtPageSize': u'300',
    }

    def parse(self, response):
        # NOTE(review): response.url is passed positionally (lands on
        # `formname`); scrapy falls back to the first form when no form
        # matches that name — confirm this is intentional.
        return scrapy.FormRequest.from_response(response,
                                                response.url,
                                                formdata=self.payload,
                                                callback=self.parse_post)

    def parse_post(self, response):
        links = response.xpath(
            '//table/tr/td/a[contains(@href, "EFileDetail.aspx")]/@href'
        ).extract()
        for link in links:
            yield response.follow(link, callback=self.parse_profile)

    def parse_profile(self, response):
        # Detail page rows are <th>label</th><td>value</td>; `ref` maps each
        # label to the item key, value xpath, and an optional URL prefix.
        item = {}
        item['election_year'] = self.election_year
        nodes = response.xpath('//table/tbody/tr')
        ref = {
            u'屆別': {
                'key': 'sitting',
                'path': 'td/span/text()'
            },
            u'類別': {
                'key': 'category',
                'path': 'td/span/text()'
            },
            u'日期': {
                'key': 'date',
                'path': 'td/span/text()'
            },
            u'資料名稱': {
                'key': 'meeting',
                'path': 'td/span/text()'
            },
            u'檔案': {
                'key': 'download_url',
                'path': 'td/a/@href',
                # Attachment hrefs are relative to the Agenda app root.
                'extra': 'http://obas_front.tcc.gov.tw:8080/Agenda/'
            },
        }
        for node in nodes:
            value = ref.get(node.xpath('th/text()').extract_first().strip())
            if value:
                item[value['key']] = '%s%s' % (value.get(
                    'extra', ''), node.xpath(value['path']).extract_first())
        # Dates on the site are in ROC calendar years.
        item['date'] = common.ROC2AD(item['date'])
        ext = re.search(u'FileName=[\w\d]+\.(\w+)&',
                        item['download_url']).group(1)
        file_name = '%s_%s.%s' % (item['sitting'], item['meeting'], ext)
        # NOTE(review): scraped values interpolated into a shell command
        # (shell=True) — shell-injection risk if the site is hostile.
        cmd = 'mkdir -p %s && wget -c -O %s%s "%s"' % (
            self.output_path, self.output_path, file_name,
            item['download_url'])
        retcode = subprocess.call(cmd, shell=True)
        return item
2: '1973', 3: '1977', 4: '1981', 5: '1985', 6: '1989', 7: '1994', 8: '1998', 9: '2002', 10: '2006', 11: '2010', 12: '2014', 13: '2018' } county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1] county = common.county_abbr2string(county_abbr) election_year = common.election_year(county) county_abbr3 = common.county2abbr3(county) total_text = codecs.open( u"../../../data/tcc/meeting_minutes-%s.txt" % election_year, "r", "utf-8").read() Session_Token = re.compile( u''' \s* (?P<name> %s議會 第\s*(?P<ad>[\d]+)\s*屆 第\s*(?P<session>[\d]+)\s*次(?P<type>(定期|臨時))大會 (預備會議暨)? 第\s*(?P<times>[\d]+)\s*次 會議
class Spider(scrapy.Spider):
    """Scrape Penghu County Council bills.

    Walks the "議會相關法案" menu for the target term, follows two levels of
    "詳細內容" listings (both paginated), and parses each bill detail page.
    Labels may be padded with whitespace/NBSP, hence the `[\s ]*` regexes.
    """
    name = "bills"
    allowed_domains = [
        "phcouncil.gov.tw",
    ]
    start_urls = ["http://www.phcouncil.gov.tw/"]
    download_delay = 1
    # County abbreviation is taken from this spider's directory name.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    # Election year -> council term (屆) number.
    ads = {'2009': 17, '2014': 18, '2018': 19}
    ad = ads[election_year]

    def parse(self, response):
        # Menu: bills section -> target term submenu -> one link per bill
        # type; the link text is carried along as the item type.
        for node in response.xpath(
                u'((//a[re:test(., "^議會相關法案$")]/following-sibling::ul)[1]/descendant::*[re:test(., "第\s*%s\s*屆")]/following-sibling::ul)[1]/descendant::a'
                % self.ad):
            yield response.follow(
                node.xpath('@href').extract_first(),
                callback=self.parse_query,
                meta={'type': node.xpath('text()').extract_first()})

    def parse_query(self, response):
        # First-level listing: follow each "detail" (詳細內容) link, paginate
        # via the "next page" (下一頁) image link.
        for link in response.css(u'#Main_Table').xpath(
                u'descendant::a[re:test(., "詳細內容")]/@href').extract():
            yield response.follow(link,
                                  callback=self.parse_list,
                                  meta={'type': response.meta['type']})
        next_page = response.xpath(
            u'//a[img[@alt="下一頁"]]/@href').extract_first()
        if next_page:
            yield response.follow(next_page,
                                  callback=self.parse_query,
                                  meta={'type': response.meta['type']})

    def parse_list(self, response):
        # Second-level listing, same structure, leading to bill profiles.
        for link in response.css(u'#Main_Table').xpath(
                u'descendant::a[re:test(., "詳細內容")]/@href').extract():
            yield response.follow(link,
                                  callback=self.parse_profile,
                                  meta={'type': response.meta['type']})
        next_page = response.xpath(
            u'//a[img[@alt="下一頁"]]/@href').extract_first()
        if next_page:
            yield response.follow(next_page,
                                  callback=self.parse_list,
                                  meta={'type': response.meta['type']})

    def parse_profile(self, response):
        item = {}
        item['election_year'] = self.election_year
        item['type'] = response.meta['type']
        item['id'] = '%s-%s' % (self.election_year,
                                re.search(u'id=([^&]*)',
                                          response.url).group(1))
        # NOTE(review): two labels map to 'execution'; whichever matches
        # last wins — confirm this is the intended precedence.
        for key, label in [('category', u'類[\s ]*別'),
                           ('abstract', u'案[\s ]*由'),
                           ('description', u'說[\s ]*明'),
                           ('methods', u'辦[\s ]*法'),
                           ('execution', u'決[\s ]*議'),
                           ('execution', u'議[\s ]*決')]:
            content = response.xpath(
                u'string((//*[re:test(., "%s")]/following-sibling::td)[1])' %
                label).extract_first()
            if content:
                item[key] = content.strip()
        # County-government bills have a fixed proposer; otherwise strip
        # honorifics and split names on fullwidth commas/spaces.
        if item['type'] == u'縣府提案':
            item['proposed_by'] = u'縣府'
        else:
            item['proposed_by'] = re.split(
                u'[,、 ]',
                re.sub(
                    u'(副?議長|議員)', '',
                    response.xpath(
                        u'//*[re:test(., "(提[\s ]*案|動[\s ]*議|請[\s ]*願)[\s ]*人")]'
                    )[-1].xpath('following-sibling::td[1]/text()').
                    extract_first()).strip())
        # Petitioners only exist on some bills; default to a one-empty-string
        # list when the label is absent.
        item['petitioned_by'] = re.split(
            u'[,、 ]',
            re.sub(
                u'(副?議長|議員)', '',
                response.xpath(
                    u'//td[re:test(., "(連[\s ]*署|附[\s ]*議)[\s ]*人")]')
                [-1].xpath('following-sibling::td[1]/text()').extract_first()).
            strip()) if len(
                response.xpath(u'//*[re:test(., "(連[\s ]*署|附[\s ]*議)[\s ]*人")]'
                               )) > 0 else ['']
        item['links'] = [{'url': response.url, 'note': 'original'}]
        yield item