def parse_profile(self, response):
     item = {}
     item['election_year'] = self.election_year
     item['county'] = self.county
     item['in_office'] = True
     item['term_start'] = '%s-12-25' % item['election_year']
     item['term_end'] = {'date': '2018-12-24'}
     # debug: show the <span> that holds the member's name and title
     print(response.xpath(
         u'//p/span[re:test(., "\s+副?議(員|長)")]/text()').extract_first())
     item['name'], item['title'] = response.xpath(
         u'//p/span[re:test(., "\s+副?議(員|長)")]/text()').extract_first(
         ).split()
     item['gender'] = self.ref[item['name']]['sex']
     item['constituency'] = response.xpath('//td/text()').re(
         u'選區:\s*(.+)')[0].strip()
     item['district'] = self.constituency[item['constituency']]
     item['image'] = urljoin(
         response.url,
         response.xpath(u'//p/img/@src').extract_first())
     item['links'] = [{'url': response.url, 'note': u'議會個人官網'}]
     item['party'] = response.xpath('//td/text()').re(
         u'政黨:\s*(.+)')[0].strip()
     item['birth'] = common.ROC2AD(
         response.xpath('//td/text()').re(u'出生日期:\s*(.+)')[0])
     website = response.xpath('//td/text()').re(u'網站連結:\s*(.+)')
     if website:
         item['links'].append({'url': website[0].strip(), 'note': u'個人網站'})
     item['contact_details'] = []
     contact_mappings = {
         u'連絡電話': 'voice',
         u'傳真號碼': 'fax',
         u'服務處': 'address',
         u'電子郵件': 'email'
     }
     for label, name in contact_mappings.items():
         values = [
             x.strip() for x in
             response.xpath(u'//td[re:test(., "%s:")]/text()' %
                            '\s*'.join(label)).re(u'%s:\s*(.+)\s*' % label)
             if x.strip()
         ]
         for value in values:
             item['contact_details'].append({
                 'label': label,
                 'type': name,
                 'value': value
             })
     item['experience'] = [
         x.strip()
         for x in response.xpath(u'//img[contains(@src, "speaker0")]')
         [1].xpath('ancestor::tr/following-sibling::tr[1]//tr/td[1]/text()'
                   ).extract() if x.strip()
     ]
     item['platform'] = [
         x.strip()
         for x in response.xpath(u'//img[contains(@src, "speaker0")]')
         [2].xpath('ancestor::tr/following-sibling::tr[1]//tr/td[1]/text()'
                   ).extract() if x.strip()
     ]
     yield item
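The helpers imported from common are not shown in these excerpts. A minimal sketch of what common.ROC2AD might do, assuming it only converts a Minguo (ROC) date string such as u'103/5/20' or u'103年5月20日' into an ISO 'YYYY-MM-DD' string (the real helper may accept more formats):

import re

# Hypothetical sketch of common.ROC2AD (an assumption, not the project's code):
# ROC year + 1911 = Gregorian year.
def ROC2AD(roc_date):
    m = re.search(u'(\d+)\D+(\d+)\D+(\d+)', roc_date or '')
    if not m:
        return None
    year, month, day = [int(x) for x in m.groups()]
    return '%04d-%02d-%02d' % (year + 1911, month, day)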
 def parse_meeting(self, response):
     try:
         sitting = response.xpath('//text()').re(u'(.+)日程表')[0]
         trs = [
             tr for tr in response.xpath('//table/descendant::tr')
             if tr.xpath('td[3]/text()').re('\d+')
         ]
         for tr in trs:
             item = {}
             item['election_year'] = self.election_year
             item['date'] = common.ROC2AD(
                 tr.xpath('td[1]/text()').extract_first())
             item['sitting'] = sitting
             item['meeting'] = tr.xpath('td[3]/text()').extract_first()
             item['download_url'] = tr.xpath(
                 'td[6]/descendant::a[1]/@href').extract_first()
             ext = item['download_url'].split('.')[-1]
             file_name = '%s_%s.%s' % (item['sitting'], item['meeting'],
                                       ext)
             if ext == 'pdf':
                 yield response.follow(item['download_url'],
                                       callback=self.download_pdf,
                                       meta={
                                           'item': item,
                                           'file_name': file_name
                                       })
             elif ext == 'htm':
                 yield response.follow(item['download_url'],
                                       callback=self.parse_html,
                                       meta={
                                           'item': item,
                                           'file_name': file_name
                                       })
     except scrapy.exceptions.NotSupported:
         pass
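The download_pdf and parse_html callbacks referenced above are not part of this excerpt. A minimal sketch of download_pdf, assuming it merely writes the fetched body to self.output_path under the file name passed in meta and then emits the item:

import os

# Hypothetical sketch of the download_pdf callback (assumed behaviour only).
def download_pdf(self, response):
    if not os.path.isdir(self.output_path):
        os.makedirs(self.output_path)
    path = os.path.join(self.output_path, response.meta['file_name'])
    with open(path, 'wb') as f:
        f.write(response.body)  # raw PDF bytes
    yield response.meta['item']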
Example #3
 def parse_list(self, response):
     for tr in response.css('#table2 tr'):
         link = tr.xpath(u'descendant::a[re:test(., "^第%d屆")]/@href' % self.ad).extract_first()
         if link:
             item = {}
             item['election_year'] = self.election_year
             item['date'] = common.ROC2AD(tr.xpath('td[1]/text()').extract_first())
             item['meeting'] = tr.xpath('td[3]/descendant::a/text()').extract_first()
             item['meeting'] = item['meeting'].replace('.', u'、')
             item['download_url'] = urljoin(response.url, link)
             ext = item['download_url'].split('.')[-1]
             file_name = '%s.%s' % (item['meeting'], ext)
             cmd = 'mkdir -p %s && wget --no-check-certificate -c -O %s%s "%s"' % (self.output_path, self.output_path, file_name, item['download_url'])
             retcode = subprocess.call(cmd, shell=True)
             yield item
Example #4
 def parse_profile(self, response):
     item = {}
     item['election_year'] = self.election_year
     nodes = response.xpath('//table/tbody/tr')
     ref = {
         u'屆別': {
             'key': 'sitting',
             'path': 'td/span/text()'
         },
         u'類別': {
             'key': 'category',
             'path': 'td/span/text()'
         },
         u'日期': {
             'key': 'date',
             'path': 'td/span/text()'
         },
         u'資料名稱': {
             'key': 'meeting',
             'path': 'td/span/text()'
         },
         u'檔案': {
             'key': 'download_url',
             'path': 'td/a/@href',
             'extra': 'http://obas_front.tcc.gov.tw:8080/Agenda/'
         },
     }
     for node in nodes:
         value = ref.get(node.xpath('th/text()').extract_first().strip())
         if value:
             item[value['key']] = '%s%s' % (value.get(
                 'extra', ''), node.xpath(value['path']).extract_first())
     item['date'] = common.ROC2AD(item['date'])
     ext = re.search(u'FileName=[\w\d]+\.(\w+)&',
                     item['download_url']).group(1)
     file_name = '%s_%s.%s' % (item['sitting'], item['meeting'], ext)
     cmd = 'mkdir -p %s && wget -c -O %s%s "%s"' % (
         self.output_path, self.output_path, file_name,
         item['download_url'])
     retcode = subprocess.call(cmd, shell=True)
     return item
 def parse_profile(self, response):
     item = response.meta['item']
     item['id'] = '-'.join(re.findall(u'=([^&]*)', response.url))
     for key, label in [('category', u'類別'), ('abstract', u'案由'), ('description', u'說明'), ('methods', u'辦法'), ('remark', u'備註'), ]:
         content = response.xpath(u'string((//td[re:test(., "%s")]/following-sibling::td)[1])' % label).extract_first()
         if content:
             item[key] = content.strip()
     item['proposed_by'] = re.split(u'\s|、', re.sub(u'(副?議長|議員)', '', u'、'.join([x.strip() for x in response.xpath(u'(//td[re:test(., "提案(人|單位)")]/following-sibling::td)[1]/text()').extract()])))
     item['petitioned_by'] = re.split(u'\s|、', re.sub(u'(副?議長|議員)', '', u'、'.join([x.strip() for x in (response.xpath(u'(//td[re:test(., "連署人")]/following-sibling::td)[1]/text()').extract() or [])])))
     item['motions'] = []
     for motion in [u'一讀', u'委員會審查意見', u'二讀決議', u'三讀決議', ]:
         date = common.ROC2AD(''.join(response.xpath(u'(//td[re:test(., "%s")]/following-sibling::td)[1]/span/text()' % motion).extract()))
         resolution = ''.join([x.strip() for x in response.xpath(u'(//td[re:test(., "%s")]/following-sibling::td)[1]/text()' % motion).extract()])
         if date or resolution:
             item['motions'].append(dict(zip(['motion', 'resolution', 'date'], [motion, resolution, date])))
     item['links'] = [
         {
             'url': response.url,
             'note': 'original'
         }
     ]
     return item
Example #6
 def parse_profile(self, response):
     trs = response.xpath('//table[@id="ContentPlaceHolder1_gvIndex"]/tr')
     for tr in trs:
         item = {}
         tds = tr.xpath('td')
         if tds:
             item['election_year'] = self.election_year
             item['date'] = common.ROC2AD(
                 tds[1].xpath('text()').extract_first())
             meeting = tds[2].xpath('text()').extract_first()
             item['meeting'] = tds[2].xpath('text()').re(
                 u'(.+?)[紀記][錄錄]')[0]
             item['download_url'] = urljoin(
                 response.url,
                 tds[3].xpath('a/@href').extract_first().strip())
             ext = item['download_url'].split('.')[-1]
             file_name = '%s.%s' % (item['meeting'], ext)
             cmd = 'mkdir -p %s && wget -c -O %s%s "%s"' % (
                 self.output_path, self.output_path, file_name,
                 item['download_url'])
             retcode = subprocess.call(cmd, shell=True)
             time.sleep(1)
             yield item
sittings = []
for match in Session_Token.finditer(total_text):
    if match:
        if match.group('type') == u'定期':
            uid = '%s-%s-%02d-CS-%02d' % (county_abbr3, election_years[int(
                match.group('ad'))], int(
                    match.group('session')), int(match.group('times')))
        elif match.group('type') == u'臨時':
            uid = '%s-%s-T%02d-CS-%02d' % (county_abbr3, election_years[int(
                match.group('ad'))], int(
                    match.group('session')), int(match.group('times')))
        sittings.append({
            "uid": uid,
            "name": re.sub('\s', '', match.group('name')),
            "county": county,
            "election_year": election_year,
            "session": match.group('session'),
            "date": common.ROC2AD(total_text[match.end():]),
            "start": match.start(),
            "end": match.end()
        })
for i in range(0, len(sittings)):
    # --> sittings, attendance, filelog
    if i != len(sittings) - 1:
        one_sitting_text = total_text[
            sittings[i]['start']:sittings[i + 1]['start']]
    else:
        one_sitting_text = total_text[sittings[i]['start']:]
    logging.info(sittings[i]['uid'])
    common.InsertSitting(c, sittings[i])
    common.FileLog(c, sittings[i]['name'])
    present_match = Present_Token.search(one_sitting_text)
    if present_match:
Namelist_Token = re.compile(u'''
    ^.*?
    具名表決[,,]
    (贊成|反對).*?者有(?P<dicision_a>.*)[,,]?共計\s*\d+位[;;]
    (贊成|反對).*?者有(?P<dicision_b>.*)[,,]?共計\s*\d+位[;;]
    表決結果.*?$
''', re.X | re.M)
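An illustrative (hypothetical) use of Namelist_Token, assuming one_sitting_text holds the text of a single sitting as sliced in the per-sitting loop below; the two named groups carry the name lists for either side of the roll-call vote:

m = Namelist_Token.search(one_sitting_text)
if m:
    # not from the original source: split each captured list into individual names
    side_a = [x for x in re.split(u'[,,、\s]+', m.group('dicision_a')) if x]
    side_b = [x for x in re.split(u'[,,、\s]+', m.group('dicision_b')) if x]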

sittings = []
for match in Session_Token.finditer(total_text):
    if match:
        if match.group('type') == u'定期':
            uid = '%s-%s-%02d-CS-%02d' % (county_abbreviation, election_years[int(match.group('ad'))], int(match.group('session')), int(match.group('times')))
        elif match.group('type') == u'臨時':
            uid = '%s-%s-T%02d-CS-%02d' % (county_abbreviation, election_years[int(match.group('ad'))], int(match.group('session')), int(match.group('times')))
        sittings.append({"uid":uid, "name": re.sub('\s', '', match.group('name')), "county": county, "election_year": election_years[int(match.group('ad'))], "session": match.group('session'), "date": common.ROC2AD(total_text[match.end():]), "start": match.start(), "end": match.end()})
for i in range(0, len(sittings)):
    # --> sittings, attendance, filelog
    if i != len(sittings)-1:
        one_sitting_text = total_text[sittings[i]['start']:sittings[i+1]['start']]
    else:
        one_sitting_text = total_text[sittings[i]['start']:]
    print(sittings[i])
    common.InsertSitting(c, sittings[i])
    common.FileLog(c, sittings[i]['name'])
    # absent
    absent_match = Absent_Token.search(one_sitting_text)
    exclude = []
    if absent_match:
        names = re.sub(u'(副?議長|議員)', '', absent_match.group('names'))
Example #9
worksheets = sh.worksheets()
for wks in worksheets:
    rows = wks.get_all_records()
    position_type = 'mayors'
    county = wks.title.replace(u'台', u'臺')
    print(county)
    for row in rows:
        if not row[u'姓名']:
            continue
        candidate = {}
        candidate['type'] = position_type
        candidate['county'] = county
        candidate['constituency'] = 0
        candidate['name'] = common.normalize_person_name(row[u'姓名'])
        candidate['election_year'] = election_year
        candidate['birth'] = common.ROC2AD(row[u'出生年月日'])
        candidate['party'] = common.normalize_party(row[u'政黨'])
        candidate['gender'] = row[u'性別']
        candidate['number'] = row[u'號次']
        candidate['education'] = row[u'學歷']
        candidate['experience'] = row[u'經歷']
        candidate['platform'] = row[u'政見']
        candidate['image'] = "%s/mayors/%s/%s_%04d.jpg" % (
            common.storage_domain(), candidate['election_year'],
            candidate['county'], candidate['number'])
        if position_type == 'mayors':
            candidate[
                'candidate_uid'], created = common.get_or_create_moyor_candidate_uid(
                    c, candidate)
        else:
            candidate[
 def parse_profile(self, response):
     item = response.meta['item']
     nodes = response.xpath('//div[@id="detail"]/table/tr')
     motions, committee_motion, council_motion = [], {}, {}
     for node in nodes:
         if node.xpath('td/text()')[0].re(u'目前處理程序'):
             item['last_action'] = node.xpath('td/text()').extract()[1]
         elif node.xpath('td/text()')[0].re(u'案由'):
             item['abstract'] = node.xpath('td/text()').extract()[1]
         elif node.xpath('td/text()')[0].re(u'提案人'):
             item['proposed_by'] = node.xpath(
                 'td/div/text()').extract()[0].strip().split(u'、')
         elif node.xpath('td/text()')[0].re(u'召集人/委員'):
             item['proposed_by'] = node.xpath(
                 'td/text()').extract()[1].strip().split(u'、')
         elif node.xpath('td/text()')[0].re(u'議決會次'):
             council_motion['motion'] = u'大會議決'
             council_motion['date'] = common.ROC2AD(
                 node.xpath('td/text()').extract()[1].split()[0])
             council_motion['sitting'] = ''.join(
                 node.xpath('td/text()').extract()[1].split()[1:])
         elif node.xpath('td/text()')[0].re(u'議決文'):
             council_motion['resolution'] = node.xpath(
                 'td/text()').extract()[1]
         elif node.xpath('td/text()')[0].re(u'案(\s| )+?號'):
             item['bill_no'] = node.xpath('td/text()').extract()[1].strip()
         elif node.xpath('td/text()')[0].re(u'來文文號'):
             td = node.xpath('td/text()').extract()[1].split()
             d = dict(
                 zip(['motion', 'resolution', 'date'],
                     [u'來文', None, common.ROC2AD(td[0])]))
             if len(td) > 1:
                 d['no'] = td[1]
             motions.append(d)
         elif node.xpath('td/text()')[0].re(u'收文日期'):
             motions.append(
                 dict(
                     zip(['motion', 'resolution', 'date'], [
                         u'收文', None,
                         common.ROC2AD(
                             node.xpath('td/text()').extract()[1])
                     ])))
         elif node.xpath('td/text()')[0].re(u'審查日期'):
             committee_motion['motion'] = u'委員會審查意見'
             committee_motion['date'] = common.ROC2AD(
                 node.xpath('td/text()').extract()[1])
         elif node.xpath('td/text()')[0].re(u'審查意見'):
             committee_motion['resolution'] = '\n'.join(
                 node.xpath('td/text()').extract()[1:])
         elif node.xpath('td/text()')[0].re(u'發文文號'):
             td = node.xpath('td/text()').extract()[1].split()
             d = dict(
                 zip(['motion', 'resolution', 'date'],
                     [u'發文', None, common.ROC2AD(td[0])]))
             if len(td) > 1:
                 d['no'] = td[1]
             motions.append(d)
         elif node.xpath('td/text()')[0].re(u'執行情形'):
             item['execution'] = node.xpath('td/text()').extract()[1]
         elif node.xpath('td/text()')[0].re(u'備[\s]*?註'):
             item['remark'] = '\n'.join(
                 node.xpath('td/text()').extract()[1:])
     for motion in [committee_motion, council_motion]:
         if motion:
             motions.append(motion)
     item['motions'] = sorted(motions,
                              key=lambda x: x.get('date'),
                              reverse=True)
     item['links'] = [{'url': response.url, 'note': 'original'}]
     return item
Example #11
meetings = json.load(
    open('../../../data/kmc/meeting_minutes-%s.json' % election_year))
for meeting in meetings:
    total_text = unicodedata.normalize(
        'NFC',
        codecs.open(
            '../../../data/kmc/meeting_minutes/%s/%s.txt' %
            (election_year, meeting['sitting']), "r", "utf-8").read())
    total_text = re.sub(u'．', u'‧', total_text)  # normalize the full-width dot used in names
    total_text = re.sub(u'　', u' ', total_text)  # normalize full-width spaces
    match = Session_Token.search(meeting['sitting'])
    if match:
        for i, session in enumerate(Present_Token.finditer(total_text), 1):
            meeting['date'] = common.ROC2AD(
                re.search(u'時\s*間[::](.*)',
                          total_text[:session.start()].strip().split(
                              '\n')[-1]).group(1))
            if match.group('type') == u'臨時':
                uid = '%s-%s-T%s-CS-%02d' % (county_abbr3, election_year,
                                             match.group('session'), i)
            else:
                uid = '%s-%s-%s-CS-%02d' % (county_abbr3, election_year,
                                            match.group('session'), i)
            sitting = {
                "uid": uid,
                "name": u'%s議會%s第%d次會議' % (county, meeting['sitting'], i),
                "county": county,
                "election_year": election_year,
                "session": match.group('session'),
                "date": meeting['date']
            }
 def parse_profile(self, response):
     item = response.meta['item']
     item_ad = response.css(u'#lbFmotion_expireb::text').extract_first()
     for election_year, ad in self.ads.items():
         if int(item_ad) == ad:
             item['election_year'] = election_year
             break
     if item['election_year'] != self.election_year:
         return
     for key, label in [('bill_id', u'lbFmotion_No'), ('type', u'lbFmotion_Category'), ('category', u'lbFmotion_Class'), ('abstract', u'lbFmotion_From'), ('description', u'lbFmotion_Reason'), ('methods', u'lbFmotion_Way')]:
         content = response.css(u'#%s::text' % label).extract_first()
         if content:
             item[key] = content.strip()
     item['proposed_by'] = re.split(u'\s|、', re.sub(u'(副?議長|議員)', '', response.css(u'#lbFmotion_People::text').extract_first()).strip())
     item['petitioned_by'] = re.split(u'\s|、', re.sub(u'(副?議長|議員)', '', (response.css(u'#lbFmotion_AddTo::text').extract_first() or '')).strip())
     item['motions'] = []
     for motion, label in [(u'大會審議', 'lbFmotion_0'), (u'程序會審定', 'lbFmotion_v'), (u'大會決定', 'lbFmotion_1'), (u'分組審查', 'lbFmotion_g'), (u'大會決議', 'lbFmotion_2')]:
         date = response.css(u'#%sdate::text' % label).extract_first()
         resolution = response.css(u'#%sopinion::text' % label).extract_first()
         if date and resolution:
             item['motions'].append(dict(zip(['motion', 'resolution', 'date'], [motion, resolution.strip(), common.ROC2AD(date)])))
     item['links'] = [
         {
             'url': response.url,
             'note': 'original'
         }
     ]
     return item