1: '1969', 2: '1973', 3: '1977', 4: '1981', 5: '1985', 6: '1989', 7: '1994', 8: '1998', 9: '2002', 10: '2006', 11: '2010', 12: '2014', 13: '2018' } county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1] county = common.county_abbr2string(county_abbr) election_year = common.election_year(county) county_abbr3 = common.county2abbr3(county) total_text = codecs.open( u"../../../data/tcc/meeting_minutes-%s.txt" % election_year, "r", "utf-8").read() Session_Token = re.compile( u''' \s* (?P<name> %s議會 第\s*(?P<ad>[\d]+)\s*屆 第\s*(?P<session>[\d]+)\s*次(?P<type>(定期|臨時))大會 (預備會議暨)? 第\s*(?P<times>[\d]+)\s*次
class Spider(scrapy.Spider):
    """Scrape councilor profiles from the Keelung City Council website.

    Crawl path: landing page -> councilor-info listing -> one profile page
    per councilor; each profile yields one dict item.
    """

    name = "councilors"
    allowed_domains = ["www.kmc.gov.tw"]
    start_urls = [
        "http://www.kmc.gov.tw/",
    ]
    download_delay = 0.5
    # The directory containing this spider doubles as the county abbreviation.
    county_abbr = os.path.dirname(os.path.realpath(__file__)).split('/')[-1]
    election_year = common.election_year(county_abbr)
    county = common.county_abbr2string(county_abbr)

    def __init__(self):
        # Constituency label -> district, loaded from a sibling JSON file.
        with open(os.path.join(os.path.dirname(__file__), 'constituency.json'), 'r') as infile:
            self.constituency = json.loads(infile.read())
        # MOI candidate reference data for this county, keyed by candidate
        # name with all whitespace stripped so lookups tolerate spacing
        # differences on the council site.
        with open(
                os.path.join(os.path.dirname(__file__),
                             '../../data/cand-moi-county-control-2018.json'),
                'r') as infile:
            self.ref = {
                re.sub(u'[\s ]', '', person['idname']): person
                for person in json.loads(infile.read())
                if person['cityname'] == u'基隆市'
            }

    def parse(self, response):
        """Follow the councilor-info link on the landing page."""
        return response.follow(
            response.xpath(u'//a[re:test(., "^議員資訊$")]/@href').extract_first(),
            callback=self.parse_list)

    def parse_list(self, response):
        """Queue every councilor profile linked from the listing page."""
        for link in response.css('#speaker a::attr(href)'):
            yield response.follow(link, callback=self.parse_profile)

    def _section_lines(self, response, index):
        """Return the stripped, non-empty text lines of the profile section
        that follows the index-th 'speaker0*' header image."""
        rows = response.xpath(u'//img[contains(@src, "speaker0")]')[index].xpath(
            'ancestor::tr/following-sibling::tr[1]//tr/td[1]/text()').extract()
        return [x.strip() for x in rows if x.strip()]

    def parse_profile(self, response):
        """Extract one councilor record from a profile page and yield it."""
        item = {}
        item['election_year'] = self.election_year
        item['county'] = self.county
        item['in_office'] = True
        item['term_start'] = '%s-12-25' % item['election_year']
        item['term_end'] = {'date': '2018-12-24'}
        # Query the name/title node once (the original evaluated this XPath
        # twice and dumped it via a bare Python-2 `print`; log instead).
        name_title = response.xpath(
            u'//p/span[re:test(., "\s+副?議(員|長)")]/text()').extract_first()
        self.logger.debug(name_title)
        item['name'], item['title'] = name_title.split()
        item['gender'] = self.ref[item['name']]['sex']
        item['constituency'] = response.xpath('//td/text()').re(
            u'選區:\s*(.+)')[0].strip()
        item['district'] = self.constituency[item['constituency']]
        item['image'] = urljoin(
            response.url, response.xpath(u'//p/img/@src').extract_first())
        item['links'] = [{'url': response.url, 'note': u'議會個人官網'}]
        item['party'] = response.xpath('//td/text()').re(
            u'政黨:\s*(.+)')[0].strip()
        item['birth'] = common.ROC2AD(
            response.xpath('//td/text()').re(u'出生日期:\s*(.+)')[0])
        website = response.xpath('//td/text()').re(u'網站連結:\s*(.+)')
        if website:
            item['links'].append({'url': website[0].strip(), 'note': u'個人網站'})
        item['contact_details'] = []
        contact_mappings = {
            u'連絡電話': 'voice',
            u'傳真號碼': 'fax',
            u'服務處': 'address',
            u'電子郵件': 'email'
        }
        # Renamed loop variable (was `name`) to avoid shadowing the spider's
        # `name` class attribute.
        for label, contact_type in contact_mappings.items():
            # Label characters may be interleaved with whitespace on the
            # page, hence the \s* join between characters in the test regex.
            values = [
                x.strip()
                for x in response.xpath(
                    u'//td[re:test(., "%s:")]/text()' % '\s*'.join(label)).re(
                        u'%s:\s*(.+)\s*' % label) if x.strip()
            ]
            for value in values:
                item['contact_details'].append({
                    'label': label,
                    'type': contact_type,
                    'value': value
                })
        # Sections 1 and 2 after the 'speaker0*' header images hold the
        # councilor's experience and platform, respectively.
        item['experience'] = self._section_lines(response, 1)
        item['platform'] = self._section_lines(response, 2)
        yield item