def fetch_group(self, group_url):
    """Fetch a parliamentary group page and build its ``Group`` record.

    The group name is the text of the ``td.headline`` cell and the short
    name is the last ``>``-separated segment of the ``.cale2`` element.
    The first nested table under the headline's outermost ``td`` is parsed
    as current members; when more than one table is present, the last one
    is parsed as former members.

    :param group_url: URL of the group page; its ``idg`` query argument
        identifies the group (``0`` marks the unaffiliated MPs).
    :returns: the populated ``Group``; every parsed member has its
        ``group`` attribute set to it.
    :raises IndexError: if the page contains no member table.
    """
    page = self.fetch_url(group_url)

    title_cell = page.find("td.headline")
    outer_cell = pq(title_cell.parents("td")[-1])
    member_tables = list(outer_cell.items("table table"))
    breadcrumb = page.find(".cale2").text()

    group = Group(
        is_independent=False,
        current_members=[],
        former_members=[],
        name=title_cell.text(),
        short_name=breadcrumb.split(">")[-1].strip(),
    )

    parser = GroupMembershipParser()
    if url_args(group_url).get("idg", type=int) == 0:
        # idg 0 is the group of unaffiliated MPs
        group.is_independent = True
        group.short_name = "Indep."

    group.current_members.extend(parser.parse_table(member_tables[0]))
    if len(member_tables) > 1:
        group.former_members.extend(parser.parse_table(member_tables[-1]))

    everyone = group.current_members + group.former_members
    for member in everyone:
        member.group = group
    return group
def fetch_group(self, group_url):
    """Fetch a parliamentary group page and build its ``Group`` record.

    Current members are read from the first nested table under the
    headline's outermost ``td``: through
    ``fetch_current_independent_members`` when the URL's ``idg`` argument
    is ``0`` (unaffiliated MPs), otherwise through
    ``fetch_current_members``. When more than one table is present, the
    last one is read with ``fetch_former_members``.

    :param group_url: URL of the group page, carrying an ``idg`` argument.
    :returns: the populated ``Group``; every member has its ``group``
        attribute set to it.
    :raises IndexError: if the page contains no member table.
    """
    page = self.fetch_url(group_url)

    title_cell = page.find('td.headline')
    outer_cell = pq(title_cell.parents('td')[-1])
    member_tables = list(outer_cell.items('table table'))
    breadcrumb = page.find('.cale2').text()

    group = Group(
        is_independent=False,
        current_members=[],
        former_members=[],
        name=title_cell.text(),
        short_name=breadcrumb.split('>')[-1].strip(),
    )

    if url_args(group_url).get('idg', type=int) == 0:
        # idg 0 is the group of unaffiliated MPs
        group.is_independent = True
        group.short_name = "Indep."
        load_current = self.fetch_current_independent_members
    else:
        load_current = self.fetch_current_members
    group.current_members.extend(load_current(member_tables[0]))

    if len(member_tables) > 1:
        former = self.fetch_former_members(member_tables[-1])
        group.former_members.extend(former)

    everyone = group.current_members + group.former_members
    for member in everyone:
        member.group = group
    return group
def fetch_committees(self):
    """Yield a ``Committee`` for every committee of the 2016 legislature.

    Chambers ``0`` and ``2`` are scraped from their listing pages and each
    committee's members are loaded with ``fetch_committee_members`` before
    it is yielded. Chamber ``1`` is not scraped: its committees are taken
    from the ``SENATE_2016_COMMITTEES`` constant and are yielded with
    empty member lists.

    :returns: a generator of ``Committee`` objects.
    :raises AssertionError: if a listing link does not start with
        ``self.committee_url_prefix`` or its ``leg`` / ``cam`` arguments
        do not match the expected legislature and chamber.
    """
    for chamber_id in [0, 1, 2]:
        if chamber_id == 1:
            # Static list; no listing page is fetched for this chamber.
            for committee_id, name in SENATE_2016_COMMITTEES:
                yield Committee(
                    cdep_id=committee_id,
                    chamber_id=1,
                    name=name,
                    current_members=[],
                    former_members=[],
                )
            continue

        url = self.listing_page_url.format(chamber_id=chamber_id)
        listing_page = self.fetch_url(url)
        for row in listing_page.items('table.tip01 tr[valign=top]'):
            # The committee link is the first anchor of the second cell.
            cell = row('td').eq(1)
            link = cell('a').eq(0)
            href = link.attr('href')
            assert href.startswith(self.committee_url_prefix)
            args = url_args(href)
            assert args['leg'] == '2016'
            assert args['cam'] == str(chamber_id)
            committee = Committee(
                cdep_id=int(args['idc']),
                chamber_id=chamber_id,
                name=link.text(),
                current_members=[],
                former_members=[],
            )
            # chamber_id is never 1 here (handled above), so members are
            # always fetched for scraped committees.
            self.fetch_committee_members(committee, href, chamber_id)
            yield committee
def list_proposals(self, cam, year=None):
    """Yield a summary dict for each proposal on a chamber's listing page.

    :param cam: chamber number, substituted into ``self.list_url`` and
        checked against every row's link.
    :param year: optional year; when truthy it is appended to the URL as
        the ``anp`` query argument.
    :returns: a generator of dicts with keys ``pk`` (the link's ``idp``
        argument), ``chamber`` and ``date`` (the parsed modification
        date).
    :raises AssertionError: if a row links to a different chamber.
    """
    # (chamber, pk) pairs that are deliberately left out of the listing.
    skipped = [(1, 282), (1, 283)]

    list_url = self.list_url.format(cam=cam)
    if year:
        list_url += '&anp=%s' % year
    page = self.fetch_url(list_url)
    # The listing table is the element right after the centred paragraph.
    table = page.find('p[align=center]').next()

    for tr in pqitems(table, 'tr[valign=top]'):
        td_list = tr.find('td')
        link = td_list.eq(1).find('a')
        args = url_args(link.attr('href'))
        assert args.get('cam', type=int) == cam

        date_txt = td_list.eq(3).text()
        try:
            modified = extract_modification_date(date_txt)
        except Exception:
            # Best effort: skip rows with an unparseable date, but let
            # KeyboardInterrupt / SystemExit propagate.
            logger.warning("Can't parse modification date %r", date_txt)
            continue

        pk = args.get('idp', type=int)
        if (cam, pk) in skipped:
            continue

        yield {
            'pk': pk,
            'chamber': cam,
            'date': modified,
        }
def scrape_day(self, day):
    """Yield the scraped voting session for every vote listed on a day.

    :param day: object with ``strftime``; formatted as ``%Y%m%d`` and
        substituted into ``self.DAY_URL``.
    :returns: a generator of whatever ``self.scrape_vote`` returns, one
        item per link found in the first column of the page's table.
    :raises AssertionError: if a link is not a nominal-vote URL.
    """
    vote_url_prefix = 'http://www.cdep.ro/pls/steno/evot.nominal?idv='

    day_page = self.fetch_url(self.DAY_URL % day.strftime('%Y%m%d'))
    votes_table = day_page.find('#pageContent table')

    for anchor in votes_table.items('td:nth-child(1) a'):
        target = anchor.attr('href')
        assert target.startswith(vote_url_prefix)
        # The vote's primary key is the ``idv`` query argument.
        yield self.scrape_vote(url_args(target).get('idv', type=int))
def fetch_committees(self):
    """Yield a ``Committee`` for each committee listed for chambers 0-2.

    For every chamber id the listing page is fetched and each
    ``table.tip01 tr[valign=top]`` row is read; the committee link is the
    first anchor in the row's second cell.

    :returns: a generator of ``Committee`` objects built from the link's
        ``idc`` argument, the chamber id and the link text.
    :raises AssertionError: if a link does not start with
        ``self.committee_url_prefix``, is not for legislature ``2012``,
        or names a different chamber.
    """
    for chamber_id in (0, 1, 2):
        listing_url = self.listing_page_url.format(chamber_id=chamber_id)
        rows = self.fetch_url(listing_url).items('table.tip01 tr[valign=top]')
        for row in rows:
            anchor = row('td').eq(1)('a').eq(0)
            target = anchor.attr('href')
            assert target.startswith(self.committee_url_prefix)

            query = url_args(target)
            assert query['leg'] == '2012'
            assert query['cam'] == str(chamber_id)

            yield Committee(
                cdep_id=int(query['idc']),
                chamber_id=chamber_id,
                name=anchor.text(),
            )
def fetch_committees(self):
    """Scrape the committee listing of each chamber (ids 0, 1 and 2).

    Every ``table.tip01 tr[valign=top]`` row of a listing page contributes
    one committee, identified by the first link in its second cell.

    :returns: a generator of ``Committee`` objects (``cdep_id`` from the
        link's ``idc`` argument, ``chamber_id``, ``name`` from the link
        text).
    :raises AssertionError: if a link has an unexpected prefix,
        legislature (must be ``2012``) or chamber.
    """
    row_selector = 'table.tip01 tr[valign=top]'
    for chamber_id in (0, 1, 2):
        page = self.fetch_url(
            self.listing_page_url.format(chamber_id=chamber_id)
        )
        for row in page.items(row_selector):
            second_cell = row('td').eq(1)
            committee_link = second_cell('a').eq(0)
            committee_href = committee_link.attr('href')
            assert committee_href.startswith(self.committee_url_prefix)

            params = url_args(committee_href)
            assert params['leg'] == '2012'
            assert params['cam'] == str(chamber_id)

            committee = Committee(
                cdep_id=int(params['idc']),
                chamber_id=chamber_id,
                name=committee_link.text(),
            )
            yield committee
def scrape_vote(self, vote_cdeppk):
    """Scrape one nominal-vote page into a ``VotingSession``.

    The subject is read from the second ``td`` next to the
    "Subiect vot:" label. If the subject links to a proposal page, the
    link's ``idp`` argument is stored as ``proposal_cdeppk``. One ``Vote``
    is recorded per row of the table containing "Nr. Crt.", skipping the
    first (header) row. When the subject starts with 'Prezenţă' the
    choices are left as ``None``.

    :param vote_cdeppk: vote id substituted into ``self.VOTE_URL``.
    :returns: the populated ``VotingSession``.
    :raises IndexError: if the subject label or its value cell is missing.
    :raises AssertionError: if a voter's profile URL is not for chamber 2.
    """
    proposal_url_prefix = (
        'http://www.cdep.ro/pls/proiecte'
        '/upl_pck.proiect?idp='
    )
    page = self.fetch_url(self.VOTE_URL % vote_cdeppk)

    label = list(page.items(':contains("Subiect vot:")'))[0]
    subject_cell = list(label.parent().items('td'))[1]
    session = VotingSession(
        cdeppk=vote_cdeppk,
        subject=subject_cell.text(),
        subject_html=subject_cell.html(),
        votes=[],
        proposal_cdeppk=None,
    )

    proposal_anchor = subject_cell.find('a[target=PROIECTE]')
    if proposal_anchor:
        target = proposal_anchor.attr('href')
        if target.startswith(proposal_url_prefix):
            session.proposal_cdeppk = url_args(target).get('idp', type=int)

    header_cell = page.find(':contains("Nr. Crt.")')
    # Outermost table among the header cell's ancestors.
    votes_table = pq(header_cell.parents().filter('table')[-1])
    is_roll_call = session.subject.startswith('Prezenţă')

    data_rows = list(votes_table.items('tr'))[1:]
    for row in data_rows:
        profile_link = row.find('a')
        year, chamber, number = parse_profile_url(profile_link.attr('href'))
        assert chamber == 2
        last_cell = list(row.items('td'))[-1]
        vote = Vote(
            mandate_year=year,
            mandate_number=number,
            mandate_name=profile_link.text(),
            choice=None,
        )
        if is_roll_call:
            # Roll-call entries carry no choice.
            session.votes.append(vote)
            continue
        vote.choice = parse_choice(last_cell.text())
        session.votes.append(vote)
    return session
def scrape_vote(self, vote_cdeppk):
    """Scrape one nominal-vote page into a ``VotingSession``.

    The subject is read from the second ``td`` next to the
    "Subiect vot:" label. If the subject links to a proposal page, the
    link's ``idp`` argument is stored as ``proposal_cdeppk``. One ``Vote``
    is recorded per row of the table containing "Nr. Crt.", skipping the
    first (header) row. Choices are parsed unless the subject is exactly
    'Prezenţă'.

    :param vote_cdeppk: vote id substituted into ``self.VOTE_URL``.
    :returns: the populated ``VotingSession``.
    :raises IndexError: if the subject label or its value cell is missing.
    :raises AssertionError: if a voter's profile URL is not for chamber 2.
    """
    proposal_url_prefix = (
        'http://www.cdep.ro/pls/proiecte'
        '/upl_pck.proiect?idp='
    )
    page = self.fetch_url(self.VOTE_URL % vote_cdeppk)

    label = list(page.items(':contains("Subiect vot:")'))[0]
    subject_cell = list(label.parent().items('td'))[1]
    session = VotingSession(
        cdeppk=vote_cdeppk,
        subject=subject_cell.text(),
        subject_html=subject_cell.html(),
        votes=[],
        proposal_cdeppk=None,
    )

    proposal_anchor = subject_cell.find('a[target=PROIECTE]')
    if proposal_anchor:
        target = proposal_anchor.attr('href')
        if target.startswith(proposal_url_prefix):
            session.proposal_cdeppk = url_args(target).get('idp', type=int)

    header_cell = page.find(':contains("Nr. Crt.")')
    # Outermost table among the header cell's ancestors.
    votes_table = pq(header_cell.parents().filter('table')[-1])
    # Exact match only: any other subject gets its choices parsed.
    has_choices = session.subject != 'Prezenţă'

    data_rows = list(votes_table.items('tr'))[1:]
    for row in data_rows:
        profile_link = row.find('a')
        year, chamber, number = parse_profile_url(profile_link.attr('href'))
        assert chamber == 2
        last_cell = list(row.items('td'))[-1]
        vote = Vote(
            mandate_year=year,
            mandate_number=number,
            mandate_name=profile_link.text(),
            choice=None,
        )
        if has_choices:
            vote.choice = parse_choice(last_cell.text())
        session.votes.append(vote)
    return session
def scrape_proposal_page(self, chamber, pk):
    """Scrape one legislative proposal page into a plain dict.

    The page's metadata table (the outermost table containing
    "Nr. înregistrare") is walked row by row; each recognised row label
    fills in part of the result.

    :param chamber: chamber number used in the URL; ``1`` stores ``pk``
        as ``pk_senate``, any other value as ``pk_cdep``.
    :param pk: proposal id (``idp``) in that chamber.
    :returns: dict that always has ``title``, ``sponsorship`` (list of
        ``(leg, idm)`` tuples), ``activity`` and ``date``, plus one of
        ``pk_senate`` / ``pk_cdep``. Depending on the rows present it may
        also have ``number_bpi``, ``number_cdep``, ``number_senate``,
        ``proposal_type``, ``pdf_url``, ``decision_chamber`` and
        ``status_text``.
    :raises ValueError: if the page does not contain exactly one
        "Nr. înregistrare" element.
    :raises AssertionError: if a chamber link carries an unexpected
        ``cam`` argument.
    """
    # Page text of the "Camera decizionala:" row -> stored value.
    decision_chambers = {
        'Camera Deputatilor': 'cdep',
        'Senatul': 'senat',
        'Camera Deputatilor + Senatul': 'common',
        '-': None,
    }

    rv = {}
    url = (
        'http://www.cdep.ro/pls/proiecte/upl_pck.proiect?idp=%d&cam=%d'
        % (pk, chamber)
    )
    page = self.fetch_url(url)

    if chamber == 1:
        rv['pk_senate'] = pk
    else:
        rv['pk_cdep'] = pk

    rv['title'] = pq('.headline', page).text()
    rv['sponsorship'] = []

    [hook_td] = pqitems(page, ':contains("Nr. înregistrare")')
    metadata_table = pq(hook_td.parents('table')[-1])
    # Registration-number texts, later passed to get_date_from_numbers.
    date_texts = []

    for row in pqitems(metadata_table.children('tr')):
        cols = row.children()
        label = cols.eq(0).text().strip()
        val_td = cols.eq(1) if len(cols) > 1 else None

        if label == "- B.P.I.:":
            txt = val_td.text()
            rv['number_bpi'] = ' '.join(
                parse_proposal_number(t)[1] for t in txt.split()
            )
            date_texts.append(txt.split()[0])

        elif label == "- Camera Deputatilor:":
            txt = val_td.text()
            rv['number_cdep'] = parse_proposal_number(txt)[1]
            date_texts.append(txt)
            link = val_td.find('a')
            if link:
                args = url_args(link.attr('href'))
                # A missing ``cam`` argument is accepted as chamber 2.
                assert args.get('cam', '2') == '2'
                rv['pk_cdep'] = args.get('idp', type=int)

        elif label == "- Senat:":
            txt = val_td.text()
            rv['number_senate'] = parse_proposal_number(txt)[1]
            date_texts.append(txt)
            link = val_td.find('a')
            if link:
                args = url_args(link.attr('href'))
                assert args.get('cam') == '1'
                rv['pk_senate'] = args.get('idp', type=int)

        elif label == "Tip initiativa:":
            rv['proposal_type'] = val_td.text()

        elif label == "Consultati:":
            for tr in pqitems(val_td, 'tr'):
                if tr.text() == "Forma iniţiatorului":
                    [a] = pqitems(tr, 'a')
                    href = a.attr('href')
                    rv['pdf_url'] = href

        elif label == "Camera decizionala:":
            txt = val_td.text()
            if txt in decision_chambers:
                rv['decision_chamber'] = decision_chambers[txt]
            else:
                # Unknown text: the key is left unset, as before.
                logger.warning("Unknown decision_chamber %r", txt)

        elif label == "Stadiu:":
            rv['status_text'] = val_td.text()

        elif label == "Initiator:":
            for link in pqitems(val_td, 'a'):
                args = url_args(link.attr('href'))
                # Only chamber-2 initiators are recorded; a missing
                # ``cam`` argument counts as chamber 2.
                if args.get('cam', 2, type=int) == 2:
                    cdep_id = (
                        args.get('leg', type=int),
                        args.get('idm', type=int),
                    )
                    rv['sponsorship'].append(cdep_id)

    rv['activity'] = self.get_activity(page)

    rv['date'] = get_date_from_numbers(date_texts)
    if rv['date'] is None:
        # Fall back to the date of the first activity entry.
        rv['date'] = rv['activity'][0].date

    return rv
def fetch_group(self, group_url, year):
    """Fetch a parliamentary group page for a legislature and build its
    ``Group`` record, applying hard-coded data corrections.

    Current members come from the first nested table under the headline's
    outermost ``td``. The last table is parsed as former members only when
    there is more than one table and the element two siblings before it
    contains "Foşti membri ai grupului".

    After parsing, a set of manual fixes is applied for specific
    ``(year, MP number)`` and ``(year, group id)`` combinations: name
    corrections, default start dates, removal of some members, and two
    extra members appended to the 2008 unaffiliated group.

    :param group_url: URL of the group page, carrying an ``idg`` argument.
    :param year: legislature year stored on the group and used to select
        the corrections.
    :returns: the populated ``Group``.
    :raises IndexError: if the page contains no member table.
    """
    # Manual name corrections keyed by (legislature year, MP number).
    name_overrides = {
        (2000, 170): "Mălaimare Mihai",
        (2004, 58): "Chiper Constantin Cătălin",
        (2004, 88): "Mălaimare Mihai",
        (2004, 329): "Bónis István",
    }

    page = self.fetch_url(group_url)
    title_cell = page.find('td.headline')
    outer_cell = pq(title_cell.parents('td')[-1])
    member_tables = list(outer_cell.items('table table'))
    breadcrumb_tail = page.find('.cale2').text().split('>')[-1].strip()

    group = Group(
        idg=url_args(group_url).get('idg', type=int),
        is_independent=False,
        current_members=[],
        former_members=[],
        name=title_cell.text(),
        short_name=breadcrumb_tail,
        year=year,
    )

    parser = GroupMembershipParser()
    if group.idg == 0:
        # idg 0 is the group of unaffiliated MPs
        group.is_independent = True
        group.short_name = "Indep."

    group.current_members.extend(parser.parse_table(member_tables[0]))

    if len(member_tables) > 1:
        former_heading = member_tables[-1].prev().prev()
        if "Foşti membri ai grupului" in former_heading.text():
            group.former_members.extend(
                parser.parse_table(member_tables[-1])
            )

    dropped = []
    everyone = group.current_members + group.former_members
    for member in everyone:
        member.group = group

        if year in (2000, 2004):
            corrected = name_overrides.get((year, member.mp_ident.number))
            if corrected is not None:
                member.mp_name = corrected

        if year == 2008:
            if group.idg == 6 and member.start_date is None:
                member.start_date = date(2011, 9, 5)
            if group.idg == 7 and member.end_date is None:
                dropped.append(member)
            if member.mp_name == "Balcan Viorel" and group.idg == 3:
                member.start_date = date(2012, 9, 3)

    for member in dropped:
        group.current_members.remove(member)

    if year == 2008 and group.idg == 0:
        # Memberships added by hand to the 2008 unaffiliated group.
        chereches = Member(
            role='Membri',
            mp_name="Cherecheş Cătălin",
            mp_ident=ProfileIdent(year=2008, chamber=2, number=65),
            group=group,
            start_date=date(2010, 9, 8),
            end_date=date(2011, 5, 30),
        )
        group.current_members.append(chereches)
        boldea = Member(
            role='Membri',
            mp_name="Boldea Mihail",
            mp_ident=ProfileIdent(year=2008, chamber=2, number=38),
            group=group,
            start_date=date(2012, 3, 19),
            end_date=date(2012, 9, 25),
        )
        group.current_members.append(boldea)

    return group