def extract_topics(cell):
    topics = []
    for li in cell.xpath('.//li'):
        topics.append(clean_topic(extract_text(li)))
    if len(topics) == 0:
        topics.append(clean_topic(extract_text(cell)))
    return topics

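# Usage sketch (illustrative, not from the source): extract_topics is assumed to be the
# staticmethod referenced from ShugiinCommitteeSpider below, and extract_text / clean_topic
# are assumed module-level helpers. The HTML is made up to show the <li> fallback.
from parsel import Selector

row = Selector(text='<table><tr><td><ul><li>内閣提出法律案</li><li>国政調査</li></ul></td>'
                    '<td>国政調査</td></tr></table>')
cells = row.xpath('//td')
print(ShugiinCommitteeSpider.extract_topics(cells[0]))  # one topic per <li>
print(ShugiinCommitteeSpider.extract_topics(cells[1]))  # falls back to the cell text itself
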
def parse_keika(self, response):
    url = build_url(response.url, title=UrlTitle.IINKAI_KEIKA, domain=self.domain)
    self.gql_client.merge(url)

    contents = response.xpath('//div[@id="ContentsBox"]')
    h2_text = contents.xpath('.//h2/text()').get()
    assert h2_text[-2:] == '経過'
    committee_name = '参議院' + h2_text[:-2]

    h4_list = contents.xpath('./h4')
    pre_list = contents.xpath('./pre')
    assert len(h4_list) == len(pre_list)
    for h4, pre in zip(h4_list, pre_list):
        dt = DateConverter.convert(extract_text(h4))
        summary = ''.join(extract_text(pre).strip().split())
        if '誤りにつき訂正' in summary:
            LOGGER.warning(f'skipping non-summary text: {summary}')
            continue
        minutes_list = self.minutes_finder.find(committee_name, dt)
        if len(minutes_list) != 1:
            LOGGER.warning(
                f'found {len(minutes_list)} Minutes matching ({committee_name}, {dt}): {minutes_list}')
        for minutes in minutes_list:
            minutes.summary = summary
            self.gql_client.merge(minutes)
            self.gql_client.link(url.id, minutes.id)

def parse_meisai_table(table):
    data = dict()
    for row in table.xpath('./tr'):
        key = extract_text(row.xpath('./th'))
        val = extract_text(row.xpath('./td'))
        data[key] = val
    return data

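# Usage sketch (illustrative): parse_meisai_table walks direct <tr> children, so a table
# without <tbody> works as-is. Assumes the module-level extract_text helper accepts a
# SelectorList and returns its concatenated text; the keys and values here are made up.
from parsel import Selector

html = '<table><tr><th>議案件名</th><td>○○法の一部を改正する法律案</td></tr>' \
       '<tr><th>提出番号</th><td>12</td></tr></table>'
table = Selector(text=html).xpath('//table')[0]
print(parse_meisai_table(table))
# -> {'議案件名': '○○法の一部を改正する法律案', '提出番号': '12'}
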
def scrape_bills_and_urls_from_table(self, table, bill_category, response_url):
    bills, urls = [], []
    for row in table.xpath('./tr')[1:]:  # skip header
        cells = row.xpath('./td')
        assert len(cells) == 5

        # build Bill instance with necessary info
        try:
            diet_number = int(extract_text(cells[0]))
            submission_number = int(extract_text(cells[1]))
            bill_name = extract_text(cells[2])
        except Exception as e:
            LOGGER.warning(f'failed to parse row:\n{row.get()}\n{e}')
            continue
        bill = build_bill(bill_category, diet_number, submission_number, bill_name)
        bills.append(bill)

        # build URL if exists
        maybe_meisai_href = extract_full_href_or_none(cells[2], response_url)
        if maybe_meisai_href:
            url = build_url(maybe_meisai_href, UrlTitle.GIAN_ZYOUHOU, self.domain)
            url.meta = {'bill_id': bill.id}
            urls.append(url)
    return bills, urls

def scrape_bills_and_urls_from_table(table, response_url):
    def get_bill_category_or_none(caption):
        if caption == '閣法の一覧':
            return BillCategory.KAKUHOU
        elif caption == '衆法の一覧':
            return BillCategory.SHUHOU
        elif caption == '参法の一覧':
            return BillCategory.SANHOU
        else:
            return None

    bills, urls = [], []
    caption = extract_text(table.xpath('./caption')).strip()
    maybe_bill_category = get_bill_category_or_none(caption)
    if not maybe_bill_category:
        return bills, urls
    bill_category = maybe_bill_category

    for row in table.xpath('./tr')[1:]:  # skip header
        cells = row.xpath('./td')
        assert len(cells) == 6

        # build Bill instance with necessary info
        try:
            diet_number = int(extract_text(cells[0]))
            submission_number = int(extract_text(cells[1]))
            bill_name = extract_text(cells[2])
        except Exception as e:
            LOGGER.warning(f'failed to parse row:\n{row.get()}\n{e}')
            continue
        bill = build_bill(bill_category, diet_number, submission_number, bill_name)
        bills.append(bill)

        # build keika URL if exists
        maybe_keika_href = extract_full_href_or_none(cells[4], response_url)
        if maybe_keika_href:
            url = build_url(maybe_keika_href, UrlTitle.KEIKA, ShugiinSpider.domain)
            url.meta = {'bill_id': bill.id}
            urls.append(url)

        # build honbun URL if exists
        maybe_honbun_href = extract_full_href_or_none(cells[5], response_url)
        if maybe_honbun_href:
            url = build_url(maybe_honbun_href, UrlTitle.HONBUN, ShugiinSpider.domain)
            url.meta = {'bill_id': bill.id}
            urls.append(url)
    return bills, urls

def parse_minutes(self, response):
    # merge url if exists
    maybe_href = extract_full_href_or_none(response.xpath('//h4'), response.url)
    if not maybe_href:
        LOGGER.warning(f'failed to find url in {response.url}')
        return
    url = build_url(maybe_href, title=UrlTitle.GAIYOU_PDF, domain=self.domain)
    self.gql_client.merge(url)
    LOGGER.debug(f'merged {url.id}')

    # link to minutes
    title = extract_text(response.xpath('//title'))
    committee_name = response.meta['committee_name']
    date_time = self.extract_datetime_from_title(title)
    minutes = build_minutes(committee_name, date_time)
    try:
        self.gql_client.get(minutes.id, ['id'])  # minutes should already exist
        self.gql_client.link(url.id, minutes.id)
    except GraphQLException:
        LOGGER.warning(f'failed to find minutes ({committee_name}, {date_time})')

def scrape_bills_and_urls(self, response):
    def get_bill_category_or_none(caption):
        if caption == '法律案(内閣提出)一覧':
            return BillCategory.KAKUHOU
        elif caption == '法律案(衆法)一覧':
            return BillCategory.SHUHOU
        elif caption == '法律案(参法)一覧':
            return BillCategory.SANHOU
        else:
            return None

    bills, urls = [], []
    div = response.xpath('//div[@id="ContentsBox"]')[0]
    tables = div.xpath('./table')
    captions = [extract_text(x) for x in div.css('h2.title_text')]
    assert len(tables) == len(captions)
    for table, caption in zip(tables, captions):
        maybe_bill_category = get_bill_category_or_none(caption)
        if maybe_bill_category:
            new_bills, new_urls = self.scrape_bills_and_urls_from_table(
                table, maybe_bill_category, response.url)
            bills.extend(new_bills)
            urls.extend(new_urls)
    return bills, urls

def scrape_committees_from_table(table):
    committees = []
    for row in table.xpath('.//tr')[1:]:  # skip header
        cells = row.xpath('.//td')
        assert len(cells) == 3
        try:
            committee_name = '衆議院' + extract_text(cells[0]).strip()
            num_members = int(extract_text(cells[1]).replace('人', ''))
            topics = ShugiinCommitteeSpider.extract_topics(cells[2])
        except Exception as e:
            LOGGER.warning(f'failed to parse row:\n{row.get()}\n{e}')
            continue
        committee = build_committee(committee_name, 'REPRESENTATIVES')
        committee.num_members = num_members
        committee.topics = topics
        committees.append(committee)
    return committees

def scrape_topics_list(div):
    ret = []
    for oul in div.css('ol, ul'):
        topics = [clean_topic(extract_text(li)) for li in oul.css('li')]
        ret.append(topics)
    return ret

def scrape_committees_from_table(table, root_url):
    committees = []
    for row in table.xpath('./tr'):
        for cell in row.xpath('./td'):
            committee = Committee(None)
            committee.name = '衆議院' + extract_text(cell.xpath('./span/a'))
            committee.url = extract_full_href_or_none(cell, root_url)
            committees.append(committee)
    return committees

def scrape_num_members_list(div):
    ret = []
    pattern = r'委員数:([0-9]+)人'
    for p in div.css('p'):
        text = extract_text(p)
        match = re.fullmatch(pattern, text)
        if match:
            ret.append(int(match.group(1)))
    return ret

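# Usage sketch (illustrative): re.fullmatch only fires when the whole paragraph text is
# exactly the pattern, so unrelated paragraphs are ignored. Assumes the module-level
# extract_text helper reduces the <p> to its bare text.
from parsel import Selector

div = Selector(text='<div><p>委員数:45人</p><p>所管事項の説明</p></div>').xpath('//div')[0]
print(scrape_num_members_list(div))  # -> [45]
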
def extract_urls(self, cell):
    urls = []
    for a in cell.xpath('.//a'):
        text = extract_text(a)
        href = urljoin(self.start_urls[0], a.xpath('./@href').get())
        if '概要' in text:
            urls.append(build_url(href, UrlTitle.GAIYOU_PDF, self.domain))
        elif '新旧' in text:
            urls.append(build_url(href, UrlTitle.SINKYU_PDF, self.domain))
    return urls

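# Note on href resolution (illustrative values only): urljoin resolves the page-relative
# links against the spider's first start URL, e.g.:
from urllib.parse import urljoin

print(urljoin('https://www.example.go.jp/bills/index.html', './pdf/gaiyou.pdf'))
# -> https://www.example.go.jp/bills/pdf/gaiyou.pdf
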
def parse_table(self, table, bill_category=None, diet_number=None):
    for row in table.xpath('.//tr'):
        cells = row.xpath('.//td')
        if len(cells) > max(self.bill_col, self.url_col):
            try:
                bill_query = extract_text(cells[self.bill_col]).strip()
                urls = self.extract_urls(cells[self.url_col])
                LOGGER.info(f'scraped {len(urls)} urls for {bill_query}')
                self.store_urls_for_bill(urls, bill_query, bill_category, diet_number)
            except Exception as e:
                LOGGER.warning(f'failed to parse {row}: {e}')

def parse_table(self, response):
    table = response.xpath('//table')[self.table_idx]
    for row in table.xpath('.//tr'):
        cells = row.xpath('.//td')
        if len(cells) > max(self.bill_col, self.url_col):
            try:
                bill_query = extract_text(cells[self.bill_col]).strip()
                urls = self.extract_urls(cells[self.url_col])
                self.store_urls_for_bill(urls, bill_query)
                LOGGER.info(f'scraped {len(urls)} urls for {bill_query}')
            except Exception as e:
                LOGGER.warning(f'failed to parse {row}: {e}')

def scrape_members_and_urls(self, response):
    members, urls = [], []
    table = response.xpath('//table[@summary="議員一覧(50音順)"]')[0]
    for row in table.xpath('./tr')[1:]:  # skip header
        cells = row.xpath('./td')
        assert len(cells) == 6

        name = ''.join(extract_text(cells[0]).strip().split())
        tags = [  # store parliamentary group (会派) and electoral district (選挙区) as tags for now
            extract_text(cells[2]).strip(),
            extract_text(cells[3]).strip(),
        ]
        member = build_member(name)
        member.tags = tags
        member.house = 'COUNCILORS'
        members.append(member)

        maybe_href = extract_full_href_or_none(cells[0], response.url)
        if maybe_href:
            url = build_url(maybe_href, UrlTitle.GIIN_ZYOUHOU, self.domain)
            url.meta = {'member_id': member.id}
            urls.append(url)
    return members, urls

def scrape_name_list(div):
    ret = []
    # chained .css() calls translate to descendant-or-self XPath, so this keeps only
    # the <h4> elements carrying all three classes: ta_l, mt20, and fl_l
    for h4 in div.css('h4.ta_l').css('h4.mt20').css('h4.fl_l'):
        name = '参議院' + extract_text(h4).strip()
        ret.append(name)
    return ret

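# Behavior sketch (illustrative HTML): because each .css() step is descendant-or-self,
# an <h4> can match itself at every step, so the chain acts as a class intersection.
from parsel import Selector

html = '<div><h4 class="ta_l mt20 fl_l">内閣委員会</h4><h4 class="ta_l">notes</h4></div>'
div = Selector(text=html).xpath('//div')[0]
print(div.css('h4.ta_l').css('h4.mt20').css('h4.fl_l').getall())
# -> only the first <h4>, which carries all three classes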