def extract_urls(self, cell):
    urls = []
    for a in cell.xpath('.//a'):
        text = extract_text(a)
        href = urljoin(self.start_urls[0], a.xpath('./@href').get())
        if '概要' in text:  # outline document
            urls.append(build_url(href, UrlTitle.GAIYOU_PDF, self.domain))
        elif '新旧' in text:  # old/new comparison table
            urls.append(build_url(href, UrlTitle.SINKYU_PDF, self.domain))
    return urls
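# A minimal sketch of the extract_text helper assumed above, on the guess that
# it simply flattens every text node under the given selector; the repo's own
# implementation may differ.
def extract_text(selector):
    """Concatenate all text nodes under the selector into one string."""
    return ''.join(selector.xpath('.//text()').getall())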
def scrape_bills_and_urls_from_table(table, response_url):
    def get_bill_category_or_none(caption):
        # 閣法 = cabinet bill, 衆法 = HR member bill, 参法 = HC member bill
        if caption == '閣法の一覧':
            return BillCategory.KAKUHOU
        elif caption == '衆法の一覧':
            return BillCategory.SHUHOU
        elif caption == '参法の一覧':
            return BillCategory.SANHOU
        else:
            return None

    bills, urls = [], []
    caption = extract_text(table.xpath('./caption')).strip()
    maybe_bill_category = get_bill_category_or_none(caption)
    if not maybe_bill_category:
        return bills, urls
    bill_category = maybe_bill_category

    for row in table.xpath('./tr')[1:]:  # skip header
        cells = row.xpath('./td')
        assert len(cells) == 6

        # build Bill instance with necessary info
        try:
            diet_number = int(extract_text(cells[0]))
            submission_number = int(extract_text(cells[1]))
            bill_name = extract_text(cells[2])
        except Exception as e:
            LOGGER.warning(f'failed to parse row:\n{row.get()}\n{e}')
            continue
        bill = build_bill(bill_category, diet_number, submission_number, bill_name)
        bills.append(bill)

        # build keika URL if exists
        maybe_keika_href = extract_full_href_or_none(cells[4], response_url)
        if maybe_keika_href:
            url = build_url(maybe_keika_href, UrlTitle.KEIKA, ShugiinSpider.domain)
            url.meta = {'bill_id': bill.id}
            urls.append(url)

        # build honbun URL if exists
        maybe_honbun_href = extract_full_href_or_none(cells[5], response_url)
        if maybe_honbun_href:
            url = build_url(maybe_honbun_href, UrlTitle.HONBUN, ShugiinSpider.domain)
            url.meta = {'bill_id': bill.id}
            urls.append(url)

    return bills, urls
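# A possible implementation of the extract_full_href_or_none helper assumed
# above (hypothetical; the repo's own version may differ): return the first
# href found under the cell, resolved against the response URL, or None.
from urllib.parse import urljoin

def extract_full_href_or_none(cell, response_url):
    href = cell.xpath('.//a/@href').get()
    if href:
        return urljoin(response_url, href)
    return None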
def parse_minutes(self, response):
    # merge url if exists
    maybe_href = extract_full_href_or_none(response.xpath('//h4'), response.url)
    if not maybe_href:
        LOGGER.warning(f'failed to find url in {response.url}')
        return
    url = build_url(maybe_href, title=UrlTitle.GAIYOU_PDF, domain=self.domain)
    self.gql_client.merge(url)
    LOGGER.debug(f'merged {url.id}')

    # link to minutes
    title = extract_text(response.xpath('//title'))
    committee_name = response.meta['committee_name']
    date_time = self.extract_datetime_from_title(title)
    minutes = build_minutes(committee_name, date_time)
    try:
        self.gql_client.get(minutes.id, ['id'])  # minutes should already exist
        self.gql_client.link(url.id, minutes.id)
    except GraphQLException:
        LOGGER.warning(f'failed to find minutes ({committee_name}, {date_time})')
def parse_keika(self, response):
    url = build_url(response.url, title=UrlTitle.IINKAI_KEIKA, domain=self.domain)
    self.gql_client.merge(url)

    contents = response.xpath('//div[@id="ContentsBox"]')
    h2_text = contents.xpath('.//h2/text()').get()
    assert h2_text[-2:] == '経過'  # heading ends with '経過' (proceedings)
    committee_name = '参議院' + h2_text[:-2]

    h4_list = contents.xpath('./h4')
    pre_list = contents.xpath('./pre')
    assert len(h4_list) == len(pre_list)
    for h4, pre in zip(h4_list, pre_list):
        dt = DateConverter.convert(extract_text(h4))
        summary = ''.join(extract_text(pre).strip().split())  # drop all whitespace
        if '誤りにつき訂正' in summary:  # correction notice, not a summary
            LOGGER.warning(f'skip non summary: {summary}')
            continue
        minutes_list = self.minutes_finder.find(committee_name, dt)
        if len(minutes_list) != 1:
            LOGGER.warning(
                f'found {len(minutes_list)} Minutes that match with ({committee_name}, {dt}): {minutes_list}')
        for minutes in minutes_list:
            minutes.summary = summary
            self.gql_client.merge(minutes)
            self.gql_client.link(url.id, minutes.id)
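# DateConverter is assumed to turn Japanese-era date strings (e.g. '令和3年4月1日')
# into datetime objects. An illustrative sketch of that conversion, not the
# repo's actual implementation:
import re
from datetime import datetime

class DateConverter:
    ERA_OFFSETS = {'令和': 2018, '平成': 1988, '昭和': 1925}  # western year of era year 1, minus 1

    @classmethod
    def convert(cls, text):
        m = re.search(r'(令和|平成|昭和)(\d+|元)年(\d+)月(\d+)日', text)
        if not m:
            raise ValueError(f'failed to parse date from {text}')
        era, year, month, day = m.groups()
        year = 1 if year == '元' else int(year)  # 元年 = first year of the era
        return datetime(cls.ERA_OFFSETS[era] + year, int(month), int(day))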
def scrape_bills_and_urls_from_table(self, table, bill_category, response_url):
    bills, urls = [], []
    for row in table.xpath('./tr')[1:]:  # skip header
        cells = row.xpath('./td')
        assert len(cells) == 5

        # build Bill instance with necessary info
        try:
            diet_number = int(extract_text(cells[0]))
            submission_number = int(extract_text(cells[1]))
            bill_name = extract_text(cells[2])
        except Exception as e:
            LOGGER.warning(f'failed to parse row:\n{row.get()}\n{e}')
            continue
        bill = build_bill(bill_category, diet_number, submission_number, bill_name)
        bills.append(bill)

        # build meisai URL if exists
        maybe_meisai_href = extract_full_href_or_none(cells[2], response_url)
        if maybe_meisai_href:
            url = build_url(maybe_meisai_href, UrlTitle.GIAN_ZYOUHOU, self.domain)
            url.meta = {'bill_id': bill.id}
            urls.append(url)
    return bills, urls
def scrape_minutes_activities_urls(self, response): date_time, meeting_name = None, None for row in response.xpath('//div[@id="library"]/table//tr'): tds = row.xpath('./td') term = tds[1].xpath('.//text()').get() desc = tds[3].xpath('.//text()').get().split()[0] if term == '開会日': date_time = datetime.strptime(desc, '%Y年%m月%d日') if term == '会議名': meeting_name = self.get_full_meeting_name(desc) if not (date_time and meeting_name): msg = f'failed to extract minutes detail: date_time={date_time}, meeting_name={meeting_name}' raise ValueError(msg) minutes = build_minutes(self.house_name + meeting_name, date_time) tables = response.xpath('//div[@id="library2"]/table') topics = self.scrape_table(tables[0]) if topics: LOGGER.debug(f'scraped topics={topics}') minutes.topics = topics speakers = self.scrape_table(tables[2]) if speakers: LOGGER.debug(f'scraped speakers={speakers}') minutes.speakers = speakers # this field won't be written to GraphQL directly activity_list, url_list = self.build_activities_and_urls( tables.xpath('.//a'), minutes, response.url) url = build_url(response.url, UrlTitle.SHINGI_TYUKEI, self.domain) url.to_id = minutes.id url_list.append(url) return minutes, activity_list, url_list
def scrape_minutes_activities_urls(self, response): date_time, meeting_name = None, None for row in response.xpath('//div[@id="library"]/table//tr'): tds = row.xpath('./td') term = tds[1].xpath('.//text()').get() desc = tds[3].xpath('.//text()').get().split()[0] if term == '開会日': date_time = extract_datetime(desc) if term == '会議名': meeting_name = self.get_full_meeting_name(desc) if not (date_time and meeting_name): msg = f'failed to extract minutes detail: date_time={date_time}, meeting_name={meeting_name}' raise ValueError(msg) minutes = build_minutes(self.house_name + meeting_name, date_time) tables = response.xpath('//div[@id="library2"]/table') topics = self.scrape_table(tables[0]) if topics: LOGGER.debug(f'scraped topics={topics}') minutes.topics = topics minutes.topic_ids = self.get_topic_ids(topics) speakers = self.scrape_table(tables[2], first_section_only=True) speakers = deduplicate(speakers) if speakers: LOGGER.debug(f'scraped speakers={speakers}') minutes.speakers = speakers minutes.speaker_ids = self.get_speakers_ids(speakers) activity_list, url_list = self.build_activities_and_urls(tables.xpath('.//a'), minutes, response.url) url = build_url(response.url, UrlTitle.SHINGI_TYUKEI, self.domain) url.to_id = minutes.id url_list.append(url) return minutes, activity_list, url_list
def scrape_minutes_activities_speeches_urls(self, response_body):
    minutes_lst, activity_lst, speech_lst, url_lst = [], [], [], []
    for meeting_rec in response_body['meetingRecord']:
        # build Minutes
        try:
            minutes = build_minutes(
                meeting_rec['nameOfHouse'] + meeting_rec['nameOfMeeting'],
                datetime.strptime(meeting_rec['date'], '%Y-%m-%d'))
            minutes.ndl_min_id = meeting_rec['issueID']
            topics = extract_topics(meeting_rec['speechRecord'][0]['speech'])
            if topics:
                minutes.topics = topics
        except ValueError as e:
            LOGGER.warning(f'failed to parse minutes: {e}')
            continue
        minutes_lst.append(minutes)

        url = build_url(meeting_rec['meetingURL'], UrlTitle.HONBUN, self.domain)
        url.to_id = minutes.id
        url_lst.append(url)

        # build Speech for every record, and Activity once per speaker
        speakers = set()
        for speech_rec in meeting_rec['speechRecord']:
            speaker = speech_rec['speaker']
            speech = build_speech(minutes.id, int(speech_rec['speechOrder']))
            speech.speaker_name = speaker
            if self.collect_speech:
                speech_lst.append(speech)
            if speaker not in speakers:
                speakers.add(speaker)
                try:
                    member = self.member_finder.find_one(speaker)
                except Exception:
                    pass  # speaker may not be a Diet member
                else:
                    activity = build_minutes_activity(
                        member.id, minutes.id, minutes.start_date_time)
                    url = build_url(speech_rec['speechURL'], UrlTitle.HONBUN, self.domain)
                    url.to_id = activity.id
                    activity_lst.append(activity)
                    url_lst.append(url)
    return minutes_lst, activity_lst, speech_lst, url_lst
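# The response_body above matches the JSON shape of the NDL Kokkai Kaigiroku
# search API (https://kokkai.ndl.go.jp/api/meeting). A hedged sketch of how a
# spider might fetch it; the parameter names follow the public API docs, but
# the date range is only an example.
import requests

def fetch_meeting_records(start_date, end_date):
    response = requests.get(
        'https://kokkai.ndl.go.jp/api/meeting',
        params={
            'from': start_date,    # e.g. '2021-01-01'
            'until': end_date,     # e.g. '2021-01-31'
            'recordPacking': 'json',
        })
    response.raise_for_status()
    return response.json()  # contains 'meetingRecord' with nested 'speechRecord' lists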
def parse(self, response):
    if response.status != 404:
        name = response.xpath('//h1//text()').get()
        try:
            member = self.member_finder.find_one(name)
            url = build_url(response.url, UrlTitle.VRSDD, self.domain)
            self.gql_client.merge(url)
            self.gql_client.link(url.id, member.id)
            LOGGER.info(f'[{self.next_id}/{self.last_id}] linked {url.id} to {member.id}')
        except Exception:
            LOGGER.exception(f'failed to process {response.url}')
    if self.next_id < self.last_id:
        yield response.follow(self.build_next_url(), callback=self.parse)
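# build_next_url is assumed to walk a numeric id space one page at a time,
# consistent with the next_id/last_id bookkeeping above. A hypothetical
# sketch; base_url and the query format are assumptions, not the repo's code:
def build_next_url(self):
    self.next_id += 1
    return f'{self.base_url}?id={self.next_id}'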
def parse(self, response):
    page_title = response.xpath('//title/text()').get()
    house_name, meeting_name, date_time = self.parse_page_title(page_title)
    minutes = build_minutes(house_name + meeting_name, date_time)
    url = build_url(response.url, UrlTitle.VRSDD, self.domain)
    LOGGER.info(f'found url for minutes: {minutes}, {url}')
    try:
        # do not merge minutes because this is an unofficial data source
        self.delete_old_urls(minutes.id, url.title)
        self.gql_client.merge(url)
        self.gql_client.link(url.id, minutes.id)
    except GraphQLException as e:
        # expected when official minutes do not exist yet
        LOGGER.warning(e)
    if self.next_id < self.last_id:
        yield response.follow(self.build_next_url(), callback=self.parse)
def build_activities_and_urls(self, atags, minutes, response_url):
    """Build Minutes Activities from the <a> tags listed in the TV page."""
    activity_list, url_list = [], []
    for a in atags:
        text = a.xpath('./text()').get()
        href = a.xpath('./@href').get()
        try:
            member = self.member_finder.find_one(text)
        except ValueError as e:
            LOGGER.debug(e)  # this is expected when the speaker is not a member
        else:
            activity = build_minutes_activity(member.id, minutes.id, minutes.start_date_time)
            url = build_url(urljoin(response_url, href), UrlTitle.SHINGI_TYUKEI, self.domain)
            url.to_id = activity.id
            activity_list.append(activity)
            url_list.append(url)
    return activity_list, url_list
def scrape_minutes_activities_urls(self, response):
    content = response.xpath('//div[@id="detail-contents-inner"]')
    if not content:
        content = response.xpath('//div[@id="detail-contents-inner2"]')

    date_time, meeting_name = None, None
    for dl in content.xpath('//dl'):
        term = dl.xpath('./dt/text()').get()
        desc = dl.xpath('./dd/text()').get()
        if term == '開会日':  # opening date
            date_time = extract_datetime(desc)
        elif term == '会議名':  # meeting name
            meeting_name = desc.replace('、', '')
    if not (date_time and meeting_name):
        msg = f'failed to extract minutes detail: date_time={date_time}, meeting_name={meeting_name}'
        raise ValueError(msg)
    minutes = build_minutes(self.house_name + meeting_name, date_time)

    summary = ''.join(map(lambda x: x.strip(), content.xpath('./span/text()').getall()))
    if summary:
        minutes.summary = summary
    topics = content.xpath('./ul/li/text()').getall()
    if topics:
        LOGGER.debug(f'scraped topics={topics}')
        minutes.topics = topics
        minutes.topic_ids = self.get_topic_ids(topics)
    speakers = content.xpath('./ul/li/a/text()').getall()
    speakers = deduplicate(speakers)
    if speakers:
        LOGGER.debug(f'scraped speakers={speakers}')
        minutes.speakers = speakers
        minutes.speaker_ids = self.get_speakers_ids(speakers)

    activity_list, url_list = self.build_activities_and_urls(
        content.xpath('./ul/li/a'), minutes, response.url)
    url = build_url(response.url, UrlTitle.SHINGI_TYUKEI, self.domain)
    url.to_id = minutes.id
    url_list.append(url)
    return minutes, activity_list, url_list
def parse_sitsugi(self, response):
    contents = response.xpath('//div[@id="list-style"]')
    h3_text = contents.xpath('.//h3/text()').get()
    committee_name = '参議院' + h3_text.split()[-1]
    for a in contents.xpath('.//a'):
        text = a.xpath('./text()').get()
        href = urljoin(response.url, a.xpath('./@href').get())
        try:
            url = build_url(href, UrlTitle.IINKAI_SITSUGI, self.domain)
        except Exception as e:
            LOGGER.error(f'failed to build url from {a} in {response.url}: {e}')
            return
        self.gql_client.merge(url)
        dt = DateConverter.convert(text)
        minutes_list = self.minutes_finder.find(committee_name, dt)
        if len(minutes_list) != 1:
            LOGGER.warning(
                f'found {len(minutes_list)} Minutes that match with ({committee_name}, {dt}): {minutes_list}')
        for minutes in minutes_list:
            self.gql_client.link(url.id, minutes.id)
def scrape_members_and_urls(self, response):
    members, urls = [], []
    table = response.xpath('//table[@summary="議員一覧(50音順)"]')[0]
    for row in table.xpath('./tr')[1:]:  # skip header
        cells = row.xpath('./td')
        assert len(cells) == 6
        name = ''.join(extract_text(cells[0]).strip().split())
        tags = [
            # store 会派 (parliamentary group) and 選挙区 (district) as tags for now
            extract_text(cells[2]).strip(),
            extract_text(cells[3]).strip()
        ]
        member = build_member(name)
        member.tags = tags
        member.house = 'COUNCILORS'
        members.append(member)

        maybe_href = extract_full_href_or_none(cells[0], response.url)
        if maybe_href:
            url = build_url(maybe_href, UrlTitle.GIIN_ZYOUHOU, self.domain)
            url.meta = {'member_id': member.id}
            urls.append(url)
    return members, urls
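# A sketch of how the scraped members and urls might be persisted, following
# the gql_client.merge / link pattern used elsewhere in these spiders; this
# wiring is hypothetical, not the repo's actual callback:
def parse_members_page(self, response):
    members, urls = self.scrape_members_and_urls(response)
    for member in members:
        self.gql_client.merge(member)
    for url in urls:
        self.gql_client.merge(url)
        self.gql_client.link(url.id, url.meta['member_id'])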
def parse_items(self):
    for item in self.items:
        bill_query = item['bill']
        urls = [build_url(item['url'], item['title'], self.domain)]
        self.store_urls_for_bill(urls, bill_query)
    LOGGER.info(f'merged {len(self.items)} urls')
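# store_urls_for_bill is assumed to resolve the bill query to an id and merge
# each URL against it. A hypothetical sketch in the style of the other
# spiders; get_bill_id is an assumed helper, not a known repo function:
def store_urls_for_bill(self, urls, bill_query):
    bill_id = self.get_bill_id(bill_query)
    for url in urls:
        self.gql_client.merge(url)
        self.gql_client.link(url.id, bill_id)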