def fetch_current_members(self, table):
    """Yield one ``Member`` for every data row of *table*.

    The first ``tr`` is the header row: ``self.parse_cols`` turns it into a
    sequence of column keys, and each key is mapped to its position.  Data
    rows are read through the ``'title'`` and ``'person'`` columns; a
    ``'start_date'`` column is used only when the header provides one.

    A row whose title cell is empty inherits the last non-empty title seen
    in an earlier row (``None`` if there has been none yet).
    """
    all_rows = list(table.items('tr'))

    # Map column key -> positional index, as reported by the header row.
    column_index = {}
    for position, key in enumerate(self.parse_cols(all_rows[0])):
        column_index[key] = position

    # Optional column: None means the table carries no start dates.
    start_date_position = column_index.get('start_date')

    title_in_effect = None
    for tr in all_rows[1:]:
        cells = tr.children()

        # An empty title cell keeps the title carried over from above.
        title_text = cells.eq(column_index['title']).text()
        title_in_effect = title_text or title_in_effect

        anchor = cells.eq(column_index['person']).find('a')
        member = Member(
            title=title_in_effect,
            mp_name=anchor.text(),
            mp_ident=parse_profile_url(anchor.attr('href')),
            start_date=None,
            end_date=None,
        )

        if start_date_position is not None:
            raw_date = cells.eq(start_date_position).text()
            if raw_date:
                member.start_date = parse_date(raw_date)

        yield member
def fetch_current_independent_members(self, table):
    """Yield one ``Member`` for every data row of *table*.

    The first ``tr`` is the header row; ``self.parse_cols`` converts it to
    column keys which are mapped to their positions.  Only the ``'person'``
    column is read: the link inside that cell supplies the member's name
    and the profile URL handed to ``parse_profile_url``.  Both dates are
    left as ``None`` and no title is passed.
    """
    all_rows = list(table.items('tr'))

    # Map column key -> positional index, as reported by the header row.
    column_index = {}
    for position, key in enumerate(self.parse_cols(all_rows[0])):
        column_index[key] = position

    for tr in all_rows[1:]:
        anchor = tr.children().eq(column_index['person']).find('a')
        yield Member(
            mp_name=anchor.text(),
            mp_ident=parse_profile_url(anchor.attr('href')),
            start_date=None,
            end_date=None,
        )
def parse_transcript_page(self, link):
    """Fetch the page at *link* and gather its speeches into a ``Chapter``.

    Every ``p`` found in the ``td`` cells of ``#pageContent > table`` rows
    is visited in order.  A paragraph containing a
    ``b font[color="#0000FF"]`` element starts a new speech; the
    paragraphs after it are buffered as that speech's text until the next
    such paragraph (or the end of the page), at which point the buffered
    lines are joined with newlines and the speech is appended to the
    chapter's ``paragraphs``.

    A serial is drawn from ``self.next_paragraph_serial()`` for every
    speaker paragraph.  If the speaker marker is not wrapped in a link,
    that speech and its following text paragraphs are skipped.

    Returns the populated ``Chapter``.
    """
    page = self.fetch_url(link)
    table_rows = pqitems(page, '#pageContent > table tr')
    chapter = Chapter()
    current = None

    def flush_current():
        # Replace the working buffer with the final joined text, then
        # hand the finished speech over to the chapter.
        buffered_lines = current.pop('text_buffer')
        current['text'] = "\n".join(buffered_lines)
        chapter.paragraphs.append(current)

    # Lazily walk rows -> cells -> paragraphs as one flat stream.
    paragraphs = (
        p
        for tr in table_rows
        for td in pqitems(tr, 'td')
        for p in pqitems(td, 'p')
    )

    for paragraph in paragraphs:
        speaker_tag = paragraph('b font[color="#0000FF"]')

        if not speaker_tag:
            # Plain text paragraph: belongs to the open speech, if any.
            if current is not None:
                current['text_buffer'].append(paragraph.text())
            continue

        # A new speaker closes whatever speech was in progress.
        if current:
            flush_current()

        # The serial is consumed before we know whether the speech is kept.
        serial = self.next_paragraph_serial()
        assert len(speaker_tag) == 1
        speaker_name = self.trim_name(speaker_tag.text())

        profile_anchor = speaker_tag.parents('a')
        if not profile_anchor:
            # No profile link: drop this speech and its following text.
            current = None
            continue

        year, chamber, number = parse_profile_url(
            profile_anchor.attr('href'))
        current = Transcript({
            'mandate_year': year,
            'mandate_chamber': chamber,
            'mandate_number': number,
            'speaker_name': speaker_name,
            'text_buffer': [],
            'serial': serial,
        })

    # The last speech on the page has no following speaker to close it.
    if current:
        flush_current()

    return chapter
def scrape_vote(self, vote_cdeppk):
    """Scrape one vote page and return it as a ``VotingSession``.

    The page is fetched from ``self.VOTE_URL % vote_cdeppk``.  The subject
    is taken from the second ``td`` in the parent of the first element
    containing "Subiect vot:".  When the subject cell links to a proposal
    page (``a[target=PROIECTE]`` whose href starts with the expected
    prefix), the ``idp`` query argument is stored as ``proposal_cdeppk``.

    Individual votes come from the rows (header row skipped) of the last
    ``table`` among the ancestors of the element(s) containing "Nr. Crt.".
    If the subject starts with 'Prezenţă', each vote's ``choice`` stays
    ``None``; otherwise it is parsed from the row's last cell.
    """
    page = self.fetch_url(self.VOTE_URL % vote_cdeppk)

    label_matches = list(page.items(':contains("Subiect vot:")'))
    sibling_cells = list(label_matches[0].parent().items('td'))
    subject_td = sibling_cells[1]

    voting_session = VotingSession(
        cdeppk=vote_cdeppk,
        subject=subject_td.text(),
        subject_html=subject_td.html(),
        votes=[],
        proposal_cdeppk=None,
    )

    proposal_prefix = 'http://www.cdep.ro/pls/proiecte/upl_pck.proiect?idp='
    proposal_link = subject_td.find('a[target=PROIECTE]')
    if proposal_link:
        href = proposal_link.attr('href')
        if href.startswith(proposal_prefix):
            query = url_args(href)
            voting_session.proposal_cdeppk = query.get('idp', type=int)

    nr_crt_matches = page.find(':contains("Nr. Crt.")')
    votes_table = pq(nr_crt_matches.parents().filter('table')[-1])

    # Sessions whose subject starts with this word carry no choices.
    is_roll_call = voting_session.subject.startswith('Prezenţă')

    table_rows = list(votes_table.items('tr'))
    for tr in table_rows[1:]:
        anchor = tr.find('a')
        year, chamber, number = parse_profile_url(anchor.attr('href'))
        # Only chamber 2 profiles are expected on this page.
        assert chamber == 2

        last_cell = list(tr.items('td'))[-1]
        vote = Vote(
            mandate_year=year,
            mandate_number=number,
            mandate_name=anchor.text(),
            choice=None,
        )
        if not is_roll_call:
            vote.choice = parse_choice(last_cell.text())

        voting_session.votes.append(vote)

    return voting_session
def scrape_vote(self, vote_cdeppk):
    """Scrape one vote page and return it as a ``VotingSession``.

    The page is fetched from ``self.VOTE_URL % vote_cdeppk``.  The subject
    is taken from the second ``td`` in the parent of the first element
    containing "Subiect vot:".  When the subject cell links to a proposal
    page (``a[target=PROIECTE]`` whose href starts with the expected
    prefix), the ``idp`` query argument is stored as ``proposal_cdeppk``.

    Individual votes come from the rows (header row skipped) of the last
    ``table`` among the ancestors of the element(s) containing "Nr. Crt.".
    Each vote's ``choice`` is parsed from the row's last cell unless the
    subject is exactly 'Prezenţă', in which case it stays ``None``.
    """
    page = self.fetch_url(self.VOTE_URL % vote_cdeppk)

    label_matches = list(page.items(':contains("Subiect vot:")'))
    sibling_cells = list(label_matches[0].parent().items('td'))
    subject_td = sibling_cells[1]

    voting_session = VotingSession(
        cdeppk=vote_cdeppk,
        subject=subject_td.text(),
        subject_html=subject_td.html(),
        votes=[],
        proposal_cdeppk=None,
    )

    proposal_prefix = 'http://www.cdep.ro/pls/proiecte/upl_pck.proiect?idp='
    proposal_link = subject_td.find('a[target=PROIECTE]')
    if proposal_link:
        href = proposal_link.attr('href')
        if href.startswith(proposal_prefix):
            query = url_args(href)
            voting_session.proposal_cdeppk = query.get('idp', type=int)

    nr_crt_matches = page.find(':contains("Nr. Crt.")')
    votes_table = pq(nr_crt_matches.parents().filter('table')[-1])

    # NOTE(review): this is an exact comparison, so a subject with any
    # text besides 'Prezenţă' still has its choices parsed -- confirm
    # that is intended.
    has_choices = voting_session.subject != 'Prezenţă'

    table_rows = list(votes_table.items('tr'))
    for tr in table_rows[1:]:
        anchor = tr.find('a')
        year, chamber, number = parse_profile_url(anchor.attr('href'))
        # Only chamber 2 profiles are expected on this page.
        assert chamber == 2

        last_cell = list(tr.items('td'))[-1]
        vote = Vote(
            mandate_year=year,
            mandate_number=number,
            mandate_name=anchor.text(),
            choice=None,
        )
        if has_choices:
            vote.choice = parse_choice(last_cell.text())

        voting_session.votes.append(vote)

    return voting_session
def parse_mandates(self, table, ended=False):
    """Yield a ``Mandate`` for each data row of *table*.

    The first two children of *table* are header rows and are only
    inspected for layout hints:

    * 'Colegiul uninominal' in the second row means there is a college
      column (index 4), which pushes the party column to index 5;
    * 'Membru din' in the first row means a start-date column is present.

    For every remaining row the person's profile page is fetched to pick
    up the picture URL (``a.highslide``).  Rows recognised as minority
    mandates only get ``minority = True``; all other rows get
    constituency, college, party and county filled in.

    When *ended* is true, the end date is parsed from a column whose index
    is adjusted for the layout of the row.
    """
    all_rows = list(table.children().items())

    uninominal = 'Colegiul uninominal' in all_rows[1].text()
    college_col, party_col = (4, 5) if uninominal else (None, 4)
    has_start_date = 'Membru din' in all_rows[0].text()

    for tr in all_rows[2:]:
        cells = tr.children()
        anchor = cells.eq(1).find('a')
        profile_href = anchor.attr('href')

        year, chamber, number = parse_profile_url(profile_href)
        person_page = self.fetch_url(profile_href)
        picture = person_page.find('a.highslide')

        mandate = Mandate(
            year=year,
            chamber_number=chamber,
            cdep_number=number,
            person_name=anchor.text(),
            minority=False,
            end_date=None,
            picture_url=picture.attr('href'),
        )

        constituency_text = cells.eq(2).text()
        county_text = cells.eq(3).text()
        is_minority_row = (
            constituency_text in ["ales la nivel naţional", ""]
            and county_text in ["Mino.", "Minoritati", u"Minorităţi"]
        )

        if is_minority_row:
            mandate.minority = True
        else:
            mandate.constituency = int(constituency_text)
            mandate.college = (
                int(cells.eq(college_col).text()) if college_col else None
            )
            mandate.party_name = cells.eq(party_col).text()

            # Normalise the one county whose name is spelled with a
            # hyphen after title-casing.
            county_name = fix_local_chars(county_text.title())
            if county_name == "Bistrița-Năsăud":
                county_name = "Bistrița Năsăud"
            mandate.county_name = county_name

        if ended:
            # Start from column 6 and shift according to the row layout.
            adjustments = (
                -1 if mandate.minority else 0,
                0 if has_start_date else -1,
                1 if uninominal and not mandate.minority else 0,
            )
            end_date_col = 6 + sum(adjustments)
            mandate.end_date = parse_date(cells.eq(end_date_col).text())

        # Hard-coded name override for one specific mandate; the reason is
        # not visible here (presumably the listing shows it differently).
        if mandate.year == 2004 and mandate.cdep_number == 88:
            mandate.person_name = u"Mălaimare Mihai Adrian"

        yield mandate
def parse_mandates(self, table, ended=False):
    """Yield a ``Mandate`` for each data row of *table*.

    The first two children of *table* are header rows and are only
    inspected for layout hints:

    * 'Colegiul uninominal' in the second row means there is a college
      column (index 4), which pushes the party column to index 5;
    * 'Membru din' in the first row means a start-date column is present.

    For every remaining row the person's profile page is fetched.  It
    supplies the picture URL (``a.highslide``) and the ``.headline`` text,
    which is trimmed and passed with the listing name to
    ``match_split_name`` to obtain separate first and last names.

    Rows recognised as minority mandates only get ``minority = True``; all
    other rows get constituency, college, party and county filled in.

    When *ended* is true, the end date is parsed (``fmt='ro_short_month'``)
    from a column whose index is adjusted for the layout of the row.
    """
    all_rows = list(table.children().items())

    uninominal = 'Colegiul uninominal' in all_rows[1].text()
    college_col, party_col = (4, 5) if uninominal else (None, 4)
    has_start_date = 'Membru din' in all_rows[0].text()

    for tr in all_rows[2:]:
        cells = tr.children()
        anchor = cells.eq(1).find('a')
        profile_href = anchor.attr('href')

        year, chamber, number = parse_profile_url(profile_href)
        last_first = anchor.text()

        person_page = self.fetch_url(profile_href)
        picture = person_page.find('a.highslide')

        # Keep only the leading part of the headline markup: cut at the
        # first line break, then the first comma, then the first run of
        # four non-breaking spaces.
        first_last = person_page.find('.headline').html()
        for separator in ('<br/>', ',', '\xa0\xa0\xa0\xa0'):
            first_last = first_last.split(separator)[0]

        first_name, last_name = match_split_name(last_first, first_last)

        mandate = Mandate(
            year=year,
            chamber_number=chamber,
            cdep_number=number,
            person_name=last_first,
            person_first_name=first_name,
            person_last_name=last_name,
            minority=False,
            end_date=None,
            picture_url=picture.attr('href'),
        )

        constituency_text = cells.eq(2).text()
        county_text = cells.eq(3).text()
        is_minority_row = (
            constituency_text in ["ales la nivel naţional", ""]
            and county_text in ["Mino.", "Minoritati", u"Minorităţi"]
        )

        if is_minority_row:
            mandate.minority = True
        else:
            mandate.constituency = int(constituency_text)
            mandate.college = (
                int(cells.eq(college_col).text()) if college_col else None
            )
            mandate.party_name = cells.eq(party_col).text()

            # Normalise the one county whose name is spelled with a
            # hyphen after title-casing.
            county_name = fix_local_chars(county_text.title())
            if county_name == "Bistrița-Năsăud":
                county_name = "Bistrița Năsăud"
            mandate.county_name = county_name

        if ended:
            # Start from column 6 and shift according to the row layout.
            adjustments = (
                -1 if mandate.minority else 0,
                0 if has_start_date else -1,
                1 if uninominal and not mandate.minority else 0,
            )
            end_date_col = 6 + sum(adjustments)
            mandate.end_date = parse_date(
                cells.eq(end_date_col).text(),
                fmt='ro_short_month',
            )

        yield mandate