def parse_profile(self, response):
    """Parse a councilor profile page into an item dict.

    Extracts the councilor's name/title, constituency, party, birth date,
    contact details, experience and platform from the profile HTML, then
    yields the assembled item.
    """
    item = {}
    item['election_year'] = self.election_year
    item['county'] = self.county
    item['in_office'] = True
    item['term_start'] = '%s-12-25' % item['election_year']
    item['term_end'] = {'date': '2018-12-24'}
    # The name cell reads "<name> 副?議員/議長"; evaluate the xpath once and
    # split it (the original ran the same xpath twice and left a debug print).
    name_title = response.xpath(
        u'//p/span[re:test(., "\s+副?議(員|長)")]/text()').extract_first()
    item['name'], item['title'] = name_title.split()
    item['gender'] = self.ref[item['name']]['sex']
    item['constituency'] = response.xpath('//td/text()').re(
        u'選區:\s*(.+)')[0].strip()
    item['district'] = self.constituency[item['constituency']]
    item['image'] = urljoin(
        response.url, response.xpath(u'//p/img/@src').extract_first())
    item['links'] = [{'url': response.url, 'note': u'議會個人官網'}]
    item['party'] = response.xpath('//td/text()').re(
        u'政黨:\s*(.+)')[0].strip()
    item['birth'] = common.ROC2AD(
        response.xpath('//td/text()').re(u'出生日期:\s*(.+)')[0])
    website = response.xpath('//td/text()').re(u'網站連結:\s*(.+)')
    if website:
        item['links'].append({'url': website[0].strip(), 'note': u'個人網站'})
    item['contact_details'] = []
    contact_mappings = {
        u'連絡電話': 'voice',
        u'傳真號碼': 'fax',
        u'服務處': 'address',
        u'電子郵件': 'email'
    }
    for label, name in contact_mappings.items():
        # The label characters may be separated by whitespace in the HTML,
        # hence the '\s*'-joined character pattern.
        values = [
            x.strip()
            for x in response.xpath(u'//td[re:test(., "%s:")]/text()' %
                                    '\s*'.join(label)).re(
                                        u'%s:\s*(.+)\s*' % label)
            if x.strip()
        ]
        for value in values:
            item['contact_details'].append({
                'label': label,
                'type': name,
                'value': value
            })
    # Experience and platform sit in the table row right after the 2nd and
    # 3rd "speaker0" icon respectively.
    item['experience'] = [
        x.strip()
        for x in response.xpath(u'//img[contains(@src, "speaker0")]')
        [1].xpath('ancestor::tr/following-sibling::tr[1]//tr/td[1]/text()'
                  ).extract() if x.strip()
    ]
    item['platform'] = [
        x.strip()
        for x in response.xpath(u'//img[contains(@src, "speaker0")]')
        [2].xpath('ancestor::tr/following-sibling::tr[1]//tr/td[1]/text()'
                  ).extract() if x.strip()
    ]
    yield item
def parse_meeting(self, response):
    """Walk the meeting schedule table and follow each minutes download
    link, dispatching to the PDF or HTML handler by file extension."""
    # Non-HTML responses raise NotSupported from xpath(); skip them quietly.
    try:
        sitting = response.xpath('//text()').re(u'(.+)日程表')[0]
        callbacks = {'pdf': self.download_pdf, 'htm': self.parse_html}
        for row in response.xpath('//table/descendant::tr'):
            # Rows whose third cell is not numeric are headers/fillers.
            if not row.xpath('td[3]/text()').re('\d+'):
                continue
            item = {
                'election_year': self.election_year,
                'date': common.ROC2AD(
                    row.xpath('td[1]/text()').extract_first()),
                'sitting': sitting,
                'meeting': row.xpath('td[3]/text()').extract_first(),
                'download_url': row.xpath(
                    'td[6]/descendant::a[1]/@href').extract_first(),
            }
            extension = item['download_url'].split('.')[-1]
            handler = callbacks.get(extension)
            if handler is not None:
                file_name = '%s_%s.%s' % (item['sitting'], item['meeting'],
                                          extension)
                yield response.follow(item['download_url'],
                                      callback=handler,
                                      meta={'item': item,
                                            'file_name': file_name})
    except scrapy.exceptions.NotSupported:
        pass
def parse_list(self, response):
    """Yield one item per meeting-minutes row of the current term and
    download the linked file into ``self.output_path``."""
    for tr in response.css('#table2 tr'):
        link = tr.xpath(u'descendant::a[re:test(., "^第%d屆")]/@href' %
                        self.ad).extract_first()
        if not link:
            continue
        item = {}
        item['election_year'] = self.election_year
        item['date'] = common.ROC2AD(
            tr.xpath('td[1]/text()').extract_first())
        item['meeting'] = tr.xpath(
            'td[3]/descendant::a/text()').extract_first()
        item['meeting'] = item['meeting'].replace('.', u'、')
        item['download_url'] = urljoin(response.url, link)
        ext = item['download_url'].split('.')[-1]
        file_name = '%s.%s' % (item['meeting'], ext)
        # Use argument lists instead of a shell-interpolated string: the
        # meeting title and URL come from the scraped page, so building a
        # shell=True command from them is a shell-injection risk.
        subprocess.call(['mkdir', '-p', self.output_path])
        retcode = subprocess.call([
            'wget', '--no-check-certificate', '-c',
            '-O', '%s%s' % (self.output_path, file_name),
            item['download_url'],
        ])
        yield item
def parse_profile(self, response):
    """Parse one agenda detail table into an item and download its
    attachment, returning the item."""
    item = {}
    item['election_year'] = self.election_year
    nodes = response.xpath('//table/tbody/tr')
    # Map row header text -> item key, the xpath that holds the value, and
    # an optional URL prefix for relative links.
    ref = {
        u'屆別': {
            'key': 'sitting',
            'path': 'td/span/text()'
        },
        u'類別': {
            'key': 'category',
            'path': 'td/span/text()'
        },
        u'日期': {
            'key': 'date',
            'path': 'td/span/text()'
        },
        u'資料名稱': {
            'key': 'meeting',
            'path': 'td/span/text()'
        },
        u'檔案': {
            'key': 'download_url',
            'path': 'td/a/@href',
            'extra': 'http://obas_front.tcc.gov.tw:8080/Agenda/'
        },
    }
    for node in nodes:
        value = ref.get(node.xpath('th/text()').extract_first().strip())
        if value:
            item[value['key']] = '%s%s' % (value.get(
                'extra', ''), node.xpath(value['path']).extract_first())
    item['date'] = common.ROC2AD(item['date'])
    ext = re.search(u'FileName=[\w\d]+\.(\w+)&',
                    item['download_url']).group(1)
    file_name = '%s_%s.%s' % (item['sitting'], item['meeting'], ext)
    # Argument lists instead of a shell=True string: sitting/meeting names
    # and the URL come from the scraped page, so shell interpolation is a
    # shell-injection risk.
    subprocess.call(['mkdir', '-p', self.output_path])
    retcode = subprocess.call([
        'wget', '-c',
        '-O', '%s%s' % (self.output_path, file_name),
        item['download_url'],
    ])
    return item
def parse_profile(self, response):
    """Fill the bill item from its detail page: text fields, sponsor and
    co-signer name lists, reading-stage motions, and the source link."""
    item = response.meta['item']
    item['id'] = '-'.join(re.findall(u'=([^&]*)', response.url))
    field_labels = [
        ('category', u'類別'),
        ('abstract', u'案由'),
        ('description', u'說明'),
        ('methods', u'辦法'),
        ('remark', u'備註'),
    ]
    for key, label in field_labels:
        text = response.xpath(
            u'string((//td[re:test(., "%s")]/following-sibling::td)[1])' %
            label).extract_first()
        if text:
            item[key] = text.strip()

    def name_list(label_pattern):
        # Join every text fragment of the sibling cell, strip the
        # speaker/councilor titles, then split on whitespace or the CJK
        # enumeration comma.
        fragments = response.xpath(
            u'(//td[re:test(., "%s")]/following-sibling::td)[1]/text()' %
            label_pattern).extract() or []
        joined = u'、'.join([f.strip() for f in fragments])
        return re.split(u'\s|、', re.sub(u'(副?議長|議員)', '', joined))

    item['proposed_by'] = name_list(u'提案(人|單位)')
    item['petitioned_by'] = name_list(u'連署人')
    item['motions'] = []
    for stage in [u'一讀', u'委員會審查意見', u'二讀決議', u'三讀決議']:
        date = common.ROC2AD(''.join(response.xpath(
            u'(//td[re:test(., "%s")]/following-sibling::td)[1]/span/text()' %
            stage).extract()))
        resolution = ''.join([t.strip() for t in response.xpath(
            u'(//td[re:test(., "%s")]/following-sibling::td)[1]/text()' %
            stage).extract()])
        if date or resolution:
            item['motions'].append({
                'motion': stage,
                'resolution': resolution,
                'date': date,
            })
    item['links'] = [{'url': response.url, 'note': 'original'}]
    return item
def parse_profile(self, response):
    """Yield one item per minutes row in the index table, downloading each
    linked file into ``self.output_path``."""
    trs = response.xpath('//table[@id="ContentPlaceHolder1_gvIndex"]/tr')
    for tr in trs:
        tds = tr.xpath('td')
        if not tds:
            continue  # header rows carry no <td> cells
        item = {}
        item['election_year'] = self.election_year
        item['date'] = common.ROC2AD(
            tds[1].xpath('text()').extract_first())
        # Keep only the meeting name, dropping the trailing "紀錄/記錄"
        # suffix. (An unused intermediate `meeting` local was removed.)
        item['meeting'] = tds[2].xpath('text()').re(
            u'(.+?)[紀記][錄錄]')[0]
        item['download_url'] = urljoin(
            response.url, tds[3].xpath('a/@href').extract_first().strip())
        ext = item['download_url'].split('.')[-1]
        file_name = '%s.%s' % (item['meeting'], ext)
        # Argument lists instead of a shell=True string: the meeting name
        # and URL come from the scraped page, so shell interpolation is a
        # shell-injection risk.
        subprocess.call(['mkdir', '-p', self.output_path])
        retcode = subprocess.call([
            'wget', '-c',
            '-O', '%s%s' % (self.output_path, file_name),
            item['download_url'],
        ])
        time.sleep(1)
        yield item
if match: if match.group('type') == u'定期': uid = '%s-%s-%02d-CS-%02d' % (county_abbr3, election_years[int( match.group('ad'))], int( match.group('session')), int(match.group('times'))) elif match.group('type') == u'臨時': uid = '%s-%s-T%02d-CS-%02d' % (county_abbr3, election_years[int( match.group('ad'))], int( match.group('session')), int(match.group('times'))) sittings.append({ "uid": uid, "name": re.sub('\s', '', match.group('name')), "county": county, "election_year": election_year, "session": match.group('session'), "date": common.ROC2AD(total_text[match.end():]), "start": match.start(), "end": match.end() }) for i in range(0, len(sittings)): # --> sittings, attendance, filelog if i != len(sittings) - 1: one_sitting_text = total_text[ sittings[i]['start']:sittings[i + 1]['start']] else: one_sitting_text = total_text[sittings[i]['start']:] logging.info(sittings[i]['uid']) common.InsertSitting(c, sittings[i]) common.FileLog(c, sittings[i]['name']) present_match = Present_Token.search(one_sitting_text) if present_match:
# Matches a named roll-call vote paragraph and captures the two name lists
# (supporters and opponents).
Namelist_Token = re.compile(u'''
^.*?
具名表決[,,]
(贊成|反對).*?者有(?P<dicision_a>.*)[,,]?共計\s*\d+位[;;]
(贊成|反對).*?者有(?P<dicision_b>.*)[,,]?共計\s*\d+位[;;]
表決結果.*?$
''', re.X | re.M)
sittings = []
# Collect every sitting header found in the minutes text, recording its
# character span so the per-sitting text can be sliced out below.
for match in Session_Token.finditer(total_text):
    if match:
        # uid layout: <county>-<election_year>-[T]<session>-CS-<times>;
        # the "T" prefix marks an extraordinary (臨時) session.
        if match.group('type') == u'定期':
            uid = '%s-%s-%02d-CS-%02d' % (county_abbreviation, election_years[int(match.group('ad'))], int(match.group('session')), int(match.group('times')))
        elif match.group('type') == u'臨時':
            uid = '%s-%s-T%02d-CS-%02d' % (county_abbreviation, election_years[int(match.group('ad'))], int(match.group('session')), int(match.group('times')))
        sittings.append({"uid":uid, "name": re.sub('\s', '', match.group('name')), "county": county, "election_year": election_years[int(match.group('ad'))], "session": match.group('session'), "date": common.ROC2AD(total_text[match.end():]), "start": match.start(), "end": match.end()})
for i in range(0, len(sittings)):
    # NOTE(review): this `break` makes the entire loop body below
    # unreachable — it looks like a debugging leftover; confirm whether the
    # sittings/attendance processing is meant to be disabled.
    break
    # --> sittings, attendance, filelog
    # Slice this sitting's text: from its header to the next sitting's
    # header (or to the end of the document for the last one).
    if i != len(sittings)-1:
        one_sitting_text = total_text[sittings[i]['start']:sittings[i+1]['start']]
    else:
        one_sitting_text = total_text[sittings[i]['start']:]
    # NOTE(review): Python 2 print statement — debug output.
    print sittings[i]
    common.InsertSitting(c, sittings[i])
    common.FileLog(c, sittings[i]['name'])
    # Absentees: strip titles from the captured name list, if any.
    absent_match = Absent_Token.search(one_sitting_text)
    exclude = []
    if absent_match:
        names = re.sub(u'(副?議長|議員)', '', absent_match.group('names'))
worksheets = sh.worksheets() for wks in worksheets: rows = wks.get_all_records() position_type = 'mayors' county = wks.title.replace(u'台', u'臺') print county for row in rows: if not row[u'姓名']: continue candidate = {} candidate['type'] = position_type candidate['county'] = county candidate['constituency'] = 0 candidate['name'] = common.normalize_person_name(row[u'姓名']) candidate['election_year'] = election_year candidate['birth'] = common.ROC2AD(row[u'出生年月日']) candidate['party'] = common.normalize_party(row[u'政黨']) candidate['gender'] = row[u'性別'] candidate['number'] = row[u'號次'] candidate['education'] = row[u'學歷'] candidate['experience'] = row[u'經歷'] candidate['platform'] = row[u'政見'] candidate['image'] = "%s/mayors/%s/%s_%04d.jpg" % ( common.storage_domain(), candidate['election_year'], candidate['county'], candidate['number']) if position_type == 'mayors': candidate[ 'candidate_uid'], created = common.get_or_create_moyor_candidate_uid( c, candidate) else: candidate[
def parse_profile(self, response):
    """Parse a bill detail table: status, sponsors, bill number and the
    list of motions (receipt, committee review, council resolution,
    dispatch), sorted newest-first by date.
    """
    item = response.meta['item']
    nodes = response.xpath('//div[@id="detail"]/table/tr')
    motions, committee_motion, council_motion = [], {}, {}
    for node in nodes:
        # Evaluate the row's text cells once instead of re-running the
        # same 'td/text()' xpath for every label comparison below.
        cells = node.xpath('td/text()')
        label = cells[0]
        texts = cells.extract()
        if label.re(u'目前處理程序'):
            item['last_action'] = texts[1]
        elif label.re(u'案由'):
            item['abstract'] = texts[1]
        elif label.re(u'提案人'):
            item['proposed_by'] = node.xpath(
                'td/div/text()').extract()[0].strip().split(u'、')
        elif label.re(u'召集人/委員'):
            item['proposed_by'] = texts[1].strip().split(u'、')
        elif label.re(u'議決會次'):
            council_motion['motion'] = u'大會議決'
            council_motion['date'] = common.ROC2AD(texts[1].split()[0])
            council_motion['sitting'] = ''.join(texts[1].split()[1:])
        elif label.re(u'議決文'):
            council_motion['resolution'] = texts[1]
        elif label.re(u'案(\s| )+?號'):
            item['bill_no'] = texts[1].strip()
        elif label.re(u'來文文號'):
            # "<date> [<document no>]"
            td = texts[1].split()
            d = {'motion': u'來文', 'resolution': None,
                 'date': common.ROC2AD(td[0])}
            if len(td) > 1:
                d['no'] = td[1]
            motions.append(d)
        elif label.re(u'收文日期'):
            motions.append({'motion': u'收文', 'resolution': None,
                            'date': common.ROC2AD(texts[1])})
        elif label.re(u'審查日期'):
            committee_motion['motion'] = u'委員會審查意見'
            committee_motion['date'] = common.ROC2AD(texts[1])
        elif label.re(u'審查意見'):
            committee_motion['resolution'] = '\n'.join(texts[1:])
        elif label.re(u'發文文號'):
            td = texts[1].split()
            d = {'motion': u'發文', 'resolution': None,
                 'date': common.ROC2AD(td[0])}
            if len(td) > 1:
                d['no'] = td[1]
            motions.append(d)
        elif label.re(u'執行情形'):
            item['execution'] = texts[1]
        elif label.re(u'備[\s]*?註'):
            item['remark'] = '\n'.join(texts[1:])
    # Committee/council motions are accumulated across several rows; only
    # append them once they got at least one field.
    for motion in [committee_motion, council_motion]:
        if motion:
            motions.append(motion)
    item['motions'] = sorted(motions,
                             key=lambda x: x.get('date'),
                             reverse=True)
    item['links'] = [{'url': response.url, 'note': 'original'}]
    return item
# Load the meeting index for this election year, then derive one sitting
# record per attendance block found in each meeting's minutes text.
meetings = json.load(
    open('../../../data/kmc/meeting_minutes-%s.json' % election_year))
for meeting in meetings:
    # NFC-normalize the extracted minutes text so regex matching is stable.
    total_text = unicodedata.normalize(
        'NFC',
        codecs.open(
            '../../../data/kmc/meeting_minutes/%s/%s.txt' %
            (election_year, meeting['sitting']), "r", "utf-8").read())
    # NOTE(review): the pattern below appears to be an unescaped '.' —
    # as written it would replace EVERY character with '‧'. Presumably a
    # specific (fullwidth) dot character was intended and got lost in an
    # encoding round-trip — confirm against the original file.
    total_text = re.sub(u'.', u'‧', total_text)
    # NOTE(review): likewise, this looks like fullwidth-space -> ASCII
    # space normalization — confirm the pattern character.
    total_text = re.sub(u' ', ' ', total_text)
    match = Session_Token.search(meeting['sitting'])
    if match:
        # One sitting per attendance (出席) block, numbered from 1.
        for i, session in enumerate(Present_Token.finditer(total_text), 1):
            # The meeting date sits on the "時間:" line just above the
            # attendance block.
            meeting['date'] = common.ROC2AD(
                re.search(u'時\s*間[::](.*)',
                          total_text[:session.start()].strip().split(
                              '\n')[-1]).group(1))
            # uid layout: <county>-<year>-[T]<session>-CS-<n>; "T" marks
            # an extraordinary (臨時) session.
            if match.group('type') == u'臨時':
                uid = '%s-%s-T%s-CS-%02d' % (county_abbr3, election_year,
                                             match.group('session'), i)
            else:
                uid = '%s-%s-%s-CS-%02d' % (county_abbr3, election_year,
                                            match.group('session'), i)
            sitting = {
                "uid": uid,
                "name": u'%s議會%s第%d次會議' % (county, meeting['sitting'], i),
                "county": county,
                "election_year": election_year,
                "session": match.group('session'),
                "date": meeting['date']
            }
def parse_profile(self, response):
    """Populate the bill item from its detail page, skipping bills that
    belong to a legislative term other than this spider's."""
    item = response.meta['item']
    term_text = response.css(u'#lbFmotion_expireb::text').extract_first()
    # Map the page's term number back to an election year.
    for year, term in self.ads.items():
        if term == int(term_text):
            item['election_year'] = year
            break
    if item['election_year'] != self.election_year:
        return
    text_fields = [
        ('bill_id', u'lbFmotion_No'),
        ('type', u'lbFmotion_Category'),
        ('category', u'lbFmotion_Class'),
        ('abstract', u'lbFmotion_From'),
        ('description', u'lbFmotion_Reason'),
        ('methods', u'lbFmotion_Way'),
    ]
    for key, element_id in text_fields:
        value = response.css(u'#%s::text' % element_id).extract_first()
        if value:
            item[key] = value.strip()
    # Strip councilor/speaker titles before splitting the name lists on
    # whitespace or the CJK enumeration comma.
    strip_titles = lambda s: re.sub(u'(副?議長|議員)', '', s).strip()
    proposers = response.css(u'#lbFmotion_People::text').extract_first()
    item['proposed_by'] = re.split(u'\s|、', strip_titles(proposers))
    petitioners = response.css(
        u'#lbFmotion_AddTo::text').extract_first() or ''
    item['petitioned_by'] = re.split(u'\s|、', strip_titles(petitioners))
    item['motions'] = []
    motion_stages = [
        (u'大會審議', 'lbFmotion_0'),
        (u'程序會審定', 'lbFmotion_v'),
        (u'大會決定', 'lbFmotion_1'),
        (u'分組審查', 'lbFmotion_g'),
        (u'大會決議', 'lbFmotion_2'),
    ]
    for stage, prefix in motion_stages:
        stage_date = response.css(u'#%sdate::text' % prefix).extract_first()
        opinion = response.css(u'#%sopinion::text' % prefix).extract_first()
        if stage_date and opinion:
            item['motions'].append({
                'motion': stage,
                'resolution': opinion.strip(),
                'date': common.ROC2AD(stage_date),
            })
    item['links'] = [{'url': response.url, 'note': 'original'}]
    return item