def get_question(self, href):
    # Scrape one question/interpellation detail page into a plain dict.
    # Raises AssertionError when the page does not match the expected layout.
    page = self.fetch_url(href)
    heading = page('#pageHeader .pageHeaderLinks').text()
    heading_m = self.title_pattern.match(heading)
    assert heading_m is not None, "Could not parse heading %r" % heading
    question = {}
    question['type'] = self.types[heading_m.group('type')]
    question['url'] = href
    question['title'] = page('.headline').text()
    rows = iter(pqitems(page, '#pageContent > dd > table > tr'))
    # First row is the section header for the question metadata.
    assert (self.normalize_space(
        next(rows).text()) == 'Informaţii privind interpelarea')
    question['pdf_url'] = None
    question['addressee'] = []
    question['method'] = None
    question['person'] = []
    # Labels span multiple rows: a row with an empty label cell continues
    # the previous label, so we carry label_text across iterations.
    label_text = None
    for row in rows:
        norm_text = self.normalize_space(row.text())
        if norm_text == '':
            continue
        elif norm_text == 'Informaţii privind răspunsul':
            # Start of the answer section; the question metadata ends here.
            break
        # Each data row has exactly two cells: label and value.
        [label, value] = [pq(el) for el in row[0]]
        new_label_text = label.text()
        if new_label_text:
            label_text = new_label_text
        else:
            # Continuation rows are only meaningful for the multi-valued
            # labels (multiple senders / multiple addressees).
            if label_text not in ['Adresanţi:', 'Destinatari:']:
                continue
        if label_text == 'Nr.înregistrare:':
            question['number'] = value.text()
        elif label_text == 'Data înregistrarii:':
            question['date'] = self.parse_date_dmy(value.text())
        elif label_text == 'Mod adresare:':
            question['method'] = value.text()
        elif label_text in ['Destinatar:', 'Destinatari:']:
            # The ministry name is in the first <b> element of the cell.
            ministry_el = list(pqitems(value, 'b'))[0]
            question['addressee'].append(ministry_el.text())
        elif label_text == 'Adresant:' or label_text == 'Adresanţi:':
            question['person'].append(self.person_from_td(value))
        elif label_text == 'Textul intervenţiei:':
            # The last link in the cell points at the PDF with the full text.
            link = list(pqitems(value, 'a'))[-1]
            assert link.text() == "fişier PDF"
            pdf_url = link.attr('href')
            # pdf_url_skip: module-level blacklist of known-bad PDF urls.
            if pdf_url not in pdf_url_skip:
                question['pdf_url'] = pdf_url
    # Apply manual data patches keyed by "<date>-<number>".
    question_id = '{q[date]}-{q[number]}'.format(q=question)
    patch = exceptions['patch'].get(question_id, {})
    question.update(patch)
    return question
def get_question(self, href):
    """Scrape a question/interpellation detail page into a dict.

    Fails with AssertionError when the page layout is not the one we
    expect. Manual patches from ``exceptions['patch']`` are applied at
    the end, keyed by ``"<date>-<number>"``.
    """
    page = self.fetch_url(href)
    heading = page('#pageHeader .pageHeaderLinks').text()
    match = self.title_pattern.match(heading)
    assert match is not None, "Could not parse heading %r" % heading
    question = {
        'type': self.types[match.group('type')],
        'url': href,
        'title': page('.headline').text(),
    }
    rows = iter(pqitems(page, '#pageContent > dd > table > tr'))
    first_row_text = self.normalize_space(next(rows).text())
    assert first_row_text == 'Informaţii privind interpelarea'
    question['pdf_url'] = None
    question['addressee'] = []
    question['method'] = None
    question['person'] = []
    # A row with an empty label cell continues the previous label.
    current_label = None
    for tr in rows:
        row_text = self.normalize_space(tr.text())
        if row_text == '':
            continue
        if row_text == 'Informaţii privind răspunsul':
            break  # answer section begins; question metadata is done
        label_cell, value_cell = (pq(el) for el in tr[0])
        own_label = label_cell.text()
        if own_label:
            current_label = own_label
        elif current_label not in ('Adresanţi:', 'Destinatari:'):
            # Continuation rows only matter for the multi-valued labels.
            continue
        if current_label == 'Nr.înregistrare:':
            question['number'] = value_cell.text()
        elif current_label == 'Data înregistrarii:':
            question['date'] = self.parse_date_dmy(value_cell.text())
        elif current_label == 'Mod adresare:':
            question['method'] = value_cell.text()
        elif current_label in ('Destinatar:', 'Destinatari:'):
            ministry = list(pqitems(value_cell, 'b'))[0]
            question['addressee'].append(ministry.text())
        elif current_label in ('Adresant:', 'Adresanţi:'):
            question['person'].append(self.person_from_td(value_cell))
        elif current_label == 'Textul intervenţiei:':
            pdf_link = list(pqitems(value_cell, 'a'))[-1]
            assert pdf_link.text() == "fişier PDF"
            pdf_href = pdf_link.attr('href')
            if pdf_href not in pdf_url_skip:
                question['pdf_url'] = pdf_href
    question_id = '{q[date]}-{q[number]}'.format(q=question)
    question.update(exceptions['patch'].get(question_id, {}))
    return question
def list_proposals(self, cam, year=None):
    """Yield ``{'pk', 'chamber', 'date'}`` dicts from the proposal listing.

    cam: chamber id in the site's url scheme (1 appears to be the
        senate, 2 the chamber of deputies — see scrape_proposal_page).
    year: optional year filter, appended to the listing url as ``anp``.
    """
    list_url = self.list_url.format(cam=cam)
    if year:
        list_url += '&anp=%s' % year
    page = self.fetch_url(list_url)
    # The listing table immediately follows the centered paragraph.
    table = page.find('p[align=center]').next()
    for tr in pqitems(table, 'tr[valign=top]'):
        td_list = tr.find('td')
        link = td_list.eq(1).find('a')
        args = url_args(link.attr('href'))
        assert args.get('cam', type=int) == cam
        date_txt = td_list.eq(3).text()
        try:
            date = extract_modification_date(date_txt)
        except Exception:
            # Was a bare `except:`; narrowed so KeyboardInterrupt and
            # SystemExit are no longer swallowed. Unparseable dates are
            # still a best-effort skip, not a crash.
            logger.warning("Can't parse modification date %r" % date_txt)
            continue
        pk = args.get('idp', type=int)
        if (cam, pk) in [(1, 282), (1, 283)]:
            # Known-broken records, skipped on purpose.
            continue
        yield {
            'pk': pk,
            'chamber': cam,
            'date': date,
        }
def person_from_td(self, td):
    """Return ``(name, year, number)`` for the MP profile linked from *td*.

    Returns None implicitly when no link in the cell points at an MP
    profile page.
    """
    profile_prefix = ('http://www.cdep.ro/pls/'
                      'parlam/structura.mp?')
    for anchor in pqitems(td, 'a'):
        target = anchor.attr('href')
        if not target.startswith(profile_prefix):
            continue
        (year, number) = parse_cdep_id(target)
        return (anchor.text(), year, number)
def parse_transcript_page(self, link):
    # Parse one transcript page into a Chapter whose paragraphs each hold
    # one speaker's contiguous text, joined with newlines.
    page = self.fetch_url(link)
    table_rows = pqitems(page, '#pageContent > table tr')
    transcript = None
    transcript_chapter = Chapter()

    def save_paragraph():
        # Flush the accumulated text lines into the current transcript
        # record and append it to the chapter.
        text = "\n".join(transcript.pop('text_buffer'))
        transcript['text'] = text
        transcript_chapter.paragraphs.append(transcript)

    for tr in table_rows:
        for td in pqitems(tr, 'td'):
            for paragraph in pqitems(td, 'p'):
                # Speaker names are rendered as blue <font> inside <b>.
                speakers = paragraph('b font[color="#0000FF"]')
                if speakers:
                    if transcript:
                        save_paragraph()
                    # NOTE(review): a serial is consumed here even when the
                    # speaker has no profile link and the paragraph is then
                    # dropped below — presumably intentional, but confirm.
                    serial = self.next_paragraph_serial()
                    assert len(speakers) == 1
                    speaker_name = self.trim_name(speakers.text())
                    # Careful: this rebinds the `link` parameter to the
                    # speaker's enclosing <a> element.
                    link = speakers.parents('a')
                    if not link:
                        # Speaker without a profile link: discard text until
                        # the next identifiable speaker.
                        transcript = None
                        continue
                    (year, chamber, number) = \
                        parse_profile_url(link.attr('href'))
                    transcript = Transcript({
                        'mandate_year': year,
                        'mandate_chamber': chamber,
                        'mandate_number': number,
                        'speaker_name': speaker_name,
                        'text_buffer': [],
                        'serial': serial,
                    })
                else:
                    if transcript is None:
                        # Still looking for the first speaker.
                        continue
                    text = paragraph.text()
                    transcript['text_buffer'].append(text)
    if transcript:
        # Flush the final speaker's paragraph.
        save_paragraph()
    return transcript_chapter
def extract_answer(self, rows):
    """Find the answer-PDF link among *rows*.

    Returns ``{'pdf_url': <href>}`` for the first matching row, or None
    implicitly when no row carries the answer link.
    """
    # Materialize first: `rows` may be a shared iterator and the caller
    # could rely on it being fully consumed.
    for tr in list(rows):
        row_text = self.normalize_space(tr.text())
        if row_text != "Textul răspunsului: fişier PDF":
            continue
        value_cell = pq(tr[0][1])
        pdf_link = list(pqitems(value_cell, 'a'))[-1]
        assert pdf_link.text() == "fişier PDF"
        return {"pdf_url": pdf_link.attr('href')}
def parse_steno_page(self, link):
    """Parse one stenogram page into a StenoChapter.

    Each paragraph in the chapter holds one speaker's contiguous text,
    joined with newlines. Speakers without a profile link get
    ``speaker_cdep_id = None`` (their text is still kept, unlike in
    parse_transcript_page).
    """
    page = self.fetch_url(link)
    table_rows = pqitems(page, '#pageContent > table tr')
    steno_paragraph = None
    steno_chapter = StenoChapter()
    # Fix: removed dead local `text_buffer = []` — the buffer actually
    # lives inside each StenoParagraph as 'text_buffer'.

    def save_paragraph():
        # Flush buffered lines into the paragraph and add it to the chapter.
        text = "\n".join(steno_paragraph.pop('text_buffer'))
        steno_paragraph['text'] = text
        steno_chapter.paragraphs.append(steno_paragraph)

    for tr in table_rows:
        for td in pqitems(tr, 'td'):
            for paragraph in pqitems(td, 'p'):
                # Speaker names are rendered as blue <font> inside <b>.
                speakers = paragraph('b font[color="#0000FF"]')
                if speakers:
                    if steno_paragraph:
                        save_paragraph()
                    assert len(speakers) == 1
                    speaker_name = self.trim_name(speakers.text())
                    link = speakers.parents('a')
                    if link:
                        speaker_cdep_id = get_cdep_id(link.attr('href'))
                    else:
                        speaker_cdep_id = None
                    steno_paragraph = StenoParagraph({
                        'speaker_cdep_id': speaker_cdep_id,
                        'speaker_name': speaker_name,
                        'text_buffer': [],
                        'serial': self.next_paragraph_serial(),
                    })
                else:
                    if steno_paragraph is None:
                        continue  # still looking for first speaker
                    text = paragraph.text()
                    steno_paragraph['text_buffer'].append(text)
    if steno_paragraph:
        save_paragraph()
    return steno_chapter
def fetch_mp_proposals(self, cdep_id):
    """Yield Proposal objects for the mandate ``cdep_id = (leg, idm)``.

    Yields nothing when the mandate page has no proposals table.
    """
    (leg, idm) = cdep_id
    url = self.mandate_proposal_url.format(leg=leg, idm=idm)
    page = self.fetch_url(url)
    headline = pqitems(page, ':contains("PL înregistrat la")')
    if not headline:
        return  # no proposals here
    table = pq(headline[0].parents('table')[-1])

    def cdeppk(col):
        # Numeric 'idp' query argument from the cell's link, or None when
        # the cell has no usable link / no idp value.
        href = col.find('a').attr('href') or '?'
        val = url_decode(href.split('?', 1)[1]).get('idp')
        return int(val) if val else None
    # Fix: cdeppk was redefined on every loop iteration although it
    # captures no loop state; hoisted it above the loop.

    rows = iter(pqitems(table, 'tr'))
    # Skip the two header rows, verifying we're in the right table.
    assert "PL înregistrat la" in next(rows).text()
    assert "Camera Deputaţilor" in next(rows).text()
    for row in rows:
        cols = pqitems(row, 'td')
        yield Proposal(cdeppk(cols[1]), cdeppk(cols[2]))
def fetch_people(self, year=2012):
    """Yield MP records scraped from the people listing page for *year*.

    Each record is a dict with 'cdep_id', 'name', 'county_name' and
    'minority'. Minority-representative MPs get county_name=None and
    minority=True.
    """
    people_page = self.fetch_url(self.people_url.format(year=year))
    for tr in pqitems(people_page, 'tr'):
        for a in pqitems(tr, 'a'):
            href = a.attr('href')
            # Fix: attr('href') returns None for anchors without an href,
            # which made `'structura.mp' in href` raise TypeError.
            if not href or 'structura.mp' not in href:
                continue
            name = a.text()
            cdep_id = get_cdep_id(href)
            # Renamed from `td`: parents('tr')[2] is a <tr> element.
            row = a.parents('tr')[2]
            county_name = pq(row[3]).text()
            minority = False
            if county_name in ["Mino.", "Minoritati"]:
                county_name = None
                minority = True
            yield {
                'cdep_id': cdep_id,
                'name': name,
                'county_name': county_name,
                'minority': minority,
            }
def fetch_mp_proposals(self, cdep_id):
    """Yield ``(combined_id, url)`` pairs for proposals of one mandate.

    ``cdep_id`` is a ``(leg, idm)`` tuple. ``combined_id`` is the string
    ``'cdep=<nr> senate=<nr>'`` built from the listing row; ``url`` is the
    proposal detail link, normalized to carry a ``cam=`` argument.
    """
    (leg, idm) = cdep_id
    page = self.fetch_url(self.mandate_proposal_url.format(leg=leg, idm=idm))
    headline = pqitems(page, ':contains("PL înregistrat la")')
    if not headline:
        # Mandate page has no proposals table.
        return
    table = pq(headline[0].parents('table')[-1])
    rows = iter(pqitems(table, 'tr'))
    # Skip the two header rows, verifying we found the right table.
    assert "PL înregistrat la" in next(rows).text()
    assert "Camera Deputaţilor" in next(rows).text()
    for row in rows:
        cols = pqitems(row, 'td')
        combined_id = 'cdep=%s senate=%s' % (cols[1].text(), cols[2].text())
        link = pqitems(row, 'a')[0]
        url = link.attr('href')
        if 'cam=' not in url:
            assert '?' in url
            url = url.replace('?', '?cam=2&')
        yield combined_id, url
def fetch_proposal_details(self, url):
    """Scrape a proposal detail page into a dict.

    Returns keys 'title', 'url' always, plus 'cdep_serial',
    'proposal_type', 'pdf_url' and 'decision_chamber' when the
    corresponding rows are present in the metadata table.
    """
    page = self.fetch_url(url)
    out = {
        'title': pq('.headline', page).text(),
        'url': url,
    }
    # Anchor on the registration-number cell to find the metadata table.
    [hook_td] = pqitems(page, ':contains("Nr. înregistrare")')
    metadata_table = pq(hook_td.parents('table')[-1])
    for row in pqitems(metadata_table.children('tr')):
        cols = row.children()
        label = cols.eq(0).text().strip()
        val_td = cols.eq(1) if len(cols) > 1 else None
        if label == "- Camera Deputatilor:":
            out['cdep_serial'] = val_td.text()
        elif label == "Tip initiativa:":
            out['proposal_type'] = val_td.text()
        elif label == "Consultati:":
            # The initiator's form links the full-text PDF.
            for tr in pqitems(val_td, 'tr'):
                if tr.text() == "Forma iniţiatorului":
                    [a] = pqitems(tr, 'a')
                    href = a.attr('href')
                    out['pdf_url'] = href
        elif label == "Camera decizionala:":
            txt = val_td.text()
            if txt == 'Camera Deputatilor':
                out['decision_chamber'] = 'cdep'
            elif txt == 'Senatul':
                out['decision_chamber'] = 'senat'
            else:
                # Fix: logger.warn is a deprecated alias of warning.
                logger.warning("Unknown decision_chamber %r", txt)
    return out
def run(self, year):
    """Yield parsed questions for every interpellation listed in *year*."""
    listing_url = ('http://www.cdep.ro/pls/parlam/'
                   'interpelari.lista?tip=&dat={year}&idl=1'
                   .format(year=year))
    index = self.fetch_url(listing_url)
    detail_prefix = ('http://www.cdep.ro/pls/'
                     'parlam/interpelari.detalii')
    for anchor in pqitems(index, '#pageContent table a'):
        target = anchor.attr('href')
        if target in url_skip:
            continue
        assert target.startswith(detail_prefix)
        if self.skip(target):
            logger.debug('skipping %r', target)
            continue
        yield self.get_question(target)
def run(self, year):
    """Yield a parsed question for each interpellation link in the
    year's listing page, skipping blacklisted and already-seen urls."""
    index = self.fetch_url(
        'http://www.cdep.ro/pls/parlam/'
        'interpelari.lista?tip=&dat={year}&idl=1'.format(year=year))
    for link in pqitems(index, '#pageContent table a'):
        href = link.attr('href')
        if href in url_skip:
            continue
        # Every remaining link must be a detail page.
        assert href.startswith('http://www.cdep.ro/pls/'
                               'parlam/interpelari.detalii')
        if not self.skip(href):
            yield self.get_question(href)
        else:
            logger.debug('skipping %r', href)
def expand_minority_names():
    """Expand minority root names into all inflected forms.

    Fetches the dexonline.ro paradigm page for each root name, collects
    every form (spaces stripped), and prints the updated document as
    JSON on stdout.
    """
    from mptracker.scraper.common import Scraper, get_cached_session, pqitems
    scraper = Scraper(get_cached_session(), use_cdep_opener=False)
    doc = get_minority_names()
    names = set()
    for root in doc['root_names']:
        url = ('http://dexonline.ro/definitie'
               '/{root}/paradigma'.format(root=root))
        page = scraper.fetch_url(url)
        for td in pqitems(page, 'table.lexem td.form'):
            names.add(td.text().replace(' ', ''))
    # Fix: `if '—' in names: names.remove('—')` → idiomatic discard.
    # The em-dash is dexonline's placeholder for a missing form.
    names.discard('—')
    doc['search_names'] = sorted(names)
    print(flask.json.dumps(doc, indent=2, sort_keys=True))
def fetch_summaries(self, year=2013, get_pdf_text=False):
    """Yield committee summary rows for *year*.

    Pages through the listing (100 rows per page, up to 50 pages,
    stopping at the first empty page). Each row dict has 'committee',
    'pdf_url', 'date', 'title', plus 'text' when get_pdf_text is True.
    """
    # Fix: removed unused `from collections import defaultdict`.
    for p in range(50):
        page_url = self.listing_page_url.format(offset=100 * p, year=year)
        page = self.fetch_url(page_url)
        # Anchor on the record-count text to locate the results table.
        i_el = list(pqitems(page, ":contains('înregistrări')"))[-1]
        table = list(i_el.parents('table'))[-1]
        empty_page = True
        table_rows = pqitems(pq(table), 'tr')
        # Skip the two header rows, verifying the table layout.
        assert "înregistrări găsite:" in next(table_rows).text()
        assert next(table_rows).text() == "Nr. Crt. PDF Data Titlu Comisia"
        for tr in table_rows:
            empty_page = False
            [pdf_link] = pqitems(tr, 'a[target=PDF]')
            # Fix: the cells were re-queried three times per row; query once.
            cells = list(pqitems(tr, 'td'))
            date_value = datetime.strptime(cells[2].text(),
                                           '%d.%m.%Y').date()
            title = cells[3].text()
            pdf_url = pdf_link.attr('href')
            pdf_url_m = self.pdf_url_pattern.search(pdf_url)
            assert pdf_url_m is not None, "can't parse url: %r" % pdf_url
            committee_code = pdf_url_m.group('committee')
            # Sanity check: the url's committee code matches the column.
            assert committee_names[committee_code] == cells[4].text()
            row = {
                'committee': committee_code,
                'pdf_url': pdf_url,
                'date': date_value,
                'title': title,
            }
            if get_pdf_text:
                pdf_data = self.pdf_session.get(pdf_url).content
                row['text'] = pdf_to_text(pdf_data)
            yield row
        if empty_page:
            break
def fetch_summaries(self, year=2013, get_pdf_text=False):
    """Yield committee summary rows for *year*.

    Pages through the listing 100 rows at a time (max 50 pages),
    stopping at the first empty page. Each yielded dict contains
    'committee', 'pdf_url', 'date' and 'title'; when get_pdf_text is
    True the PDF is downloaded and its text added under 'text'.
    """
    # Fix: removed unused `from collections import defaultdict`.
    for p in range(50):
        page_url = self.listing_page_url.format(offset=100 * p, year=year)
        page = self.fetch_url(page_url)
        # Anchor on the record-count text to locate the results table.
        i_el = list(pqitems(page, ":contains('înregistrări')"))[-1]
        table = list(i_el.parents('table'))[-1]
        empty_page = True
        table_rows = pqitems(pq(table), 'tr')
        # Skip the two header rows, verifying the table layout.
        assert "înregistrări găsite:" in next(table_rows).text()
        assert next(table_rows).text() == "Nr. Crt. PDF Data Titlu Comisia"
        for tr in table_rows:
            empty_page = False
            [pdf_link] = pqitems(tr, 'a[target=PDF]')
            # Fix: cells were re-queried once per column; query them once.
            cells = list(pqitems(tr, 'td'))
            date_value = datetime.strptime(cells[2].text(),
                                           '%d.%m.%Y').date()
            title = cells[3].text()
            pdf_url = pdf_link.attr('href')
            pdf_url_m = self.pdf_url_pattern.search(pdf_url)
            assert pdf_url_m is not None, "can't parse url: %r" % pdf_url
            committee_code = pdf_url_m.group('committee')
            # Sanity check: the url's committee code matches the column.
            assert committee_names[committee_code] == cells[4].text()
            row = {
                'committee': committee_code,
                'pdf_url': pdf_url,
                'date': date_value,
                'title': title,
            }
            if get_pdf_text:
                pdf_data = self.pdf_session.get(pdf_url).content
                row['text'] = pdf_to_text(pdf_data)
            yield row
        if empty_page:
            break
def fetch_proposal_details(self, prop):
    """Populate *prop* in place from its cdep and/or senate detail pages.

    Sets title, the three registration numbers, decision chamber,
    pdf_url, status, date and merged activity. Raises AssertionError
    when no date can be derived from any registration number.
    """
    # NOTE(review): this first fetch is immediately shadowed below;
    # presumably kept for its caching side effect — confirm.
    page = self.fetch_url(prop.url)
    page_cdep = page_senate = None
    if prop.url_cdep:
        page_cdep = self.fetch_url(prop.url_cdep)
    if prop.url_senate:
        page_senate = self.fetch_url(prop.url_senate)
    page = page_cdep or page_senate
    prop.title = pq('.headline', page).text()
    prop.number_bpi = None
    prop.number_cdep = None
    prop.number_senate = None
    prop.decision_chamber = None
    prop.pdf_url = None
    prop.status = None
    prop.status_text = None
    # Anchor on the registration-number cell to find the metadata table.
    [hook_td] = pqitems(page, ':contains("Nr. înregistrare")')
    metadata_table = pq(hook_td.parents('table')[-1])
    for row in pqitems(metadata_table.children('tr')):
        cols = row.children()
        label = cols.eq(0).text().strip()
        val_td = cols.eq(1) if len(cols) > 1 else None
        if label == "- B.P.I.:":
            prop.number_bpi = val_td.text()
        elif label == "- Camera Deputatilor:":
            prop.number_cdep = val_td.text()
        elif label == "- Senat:":
            prop.number_senate = val_td.text()
        elif label == "Tip initiativa:":
            prop.proposal_type = val_td.text()
        elif label == "Consultati:":
            # The initiator's form links the full-text PDF.
            for tr in pqitems(val_td, 'tr'):
                if tr.text() == "Forma iniţiatorului":
                    [a] = pqitems(tr, 'a')
                    href = a.attr('href')
                    prop.pdf_url = href
        elif label == "Camera decizionala:":
            txt = val_td.text()
            if txt == 'Camera Deputatilor':
                prop.decision_chamber = 'cdep'
            elif txt == 'Senatul':
                prop.decision_chamber = 'senat'
            elif txt == 'Camera Deputatilor + Senatul':
                prop.decision_chamber = 'common'
            else:
                # Fix: logger.warn is a deprecated alias of warning.
                logger.warning("Unknown decision_chamber %r", txt)
        elif label == "Stadiu:":
            prop.status_text = val_td.text()
            prop.status = self.classify_status(prop.status_text)
    prop.date = get_date_from_numbers(
        [prop.number_bpi, prop.number_cdep, prop.number_senate])
    assert prop.date is not None, "No date for proposal %r" % prop.url
    cdep_activity = (self.get_activity(page_cdep)
                     if page_cdep else [])
    senate_activity = (self.get_activity(page_senate)
                       if page_senate else [])
    prop.activity = self.merge_activity(cdep_activity, senate_activity)
def scrape_proposal_page(self, chamber, pk):
    """Scrape one proposal page into a dict.

    chamber: 1 for senate, 2 for chamber of deputies (the site's `cam`
        url argument).
    pk: the site's `idp` primary key for the proposal.
    Returns a dict with title, registration numbers, pk_cdep/pk_senate,
    sponsorship, decision chamber, status text, activity and date.
    """
    rv = {}
    url = (
        'http://www.cdep.ro/pls/proiecte/upl_pck.proiect?idp=%d&cam=%d'
        % (pk, chamber)
    )
    page = self.fetch_url(url)
    if chamber == 1:
        rv['pk_senate'] = pk
    else:
        rv['pk_cdep'] = pk
    rv['title'] = pq('.headline', page).text()
    rv['sponsorship'] = []
    # Anchor on the registration-number cell to find the metadata table.
    [hook_td] = pqitems(page, ':contains("Nr. înregistrare")')
    metadata_table = pq(hook_td.parents('table')[-1])
    date_texts = []
    for row in pqitems(metadata_table.children('tr')):
        cols = row.children()
        label = cols.eq(0).text().strip()
        val_td = cols.eq(1) if len(cols) > 1 else None
        if label == "- B.P.I.:":
            txt = val_td.text()
            rv['number_bpi'] = ' '.join(
                parse_proposal_number(t)[1] for t in txt.split()
            )
            date_texts.append(txt.split()[0])
        elif label == "- Camera Deputatilor:":
            txt = val_td.text()
            rv['number_cdep'] = parse_proposal_number(txt)[1]
            date_texts.append(txt)
            link = val_td.find('a')
            if link:
                args = url_args(link.attr('href'))
                assert args.get('cam', '2') == '2'
                rv['pk_cdep'] = args.get('idp', type=int)
        elif label == "- Senat:":
            txt = val_td.text()
            rv['number_senate'] = parse_proposal_number(txt)[1]
            date_texts.append(txt)
            link = val_td.find('a')
            if link:
                args = url_args(link.attr('href'))
                assert args.get('cam') == '1'
                rv['pk_senate'] = args.get('idp', type=int)
        elif label == "Tip initiativa:":
            rv['proposal_type'] = val_td.text()
        elif label == "Consultati:":
            # The initiator's form links the full-text PDF.
            for tr in pqitems(val_td, 'tr'):
                if tr.text() == "Forma iniţiatorului":
                    [a] = pqitems(tr, 'a')
                    href = a.attr('href')
                    rv['pdf_url'] = href
        elif label == "Camera decizionala:":
            txt = val_td.text()
            if txt == 'Camera Deputatilor':
                rv['decision_chamber'] = 'cdep'
            elif txt == 'Senatul':
                rv['decision_chamber'] = 'senat'
            elif txt == 'Camera Deputatilor + Senatul':
                rv['decision_chamber'] = 'common'
            elif txt == '-':
                rv['decision_chamber'] = None
            else:
                # Fix: logger.warn is a deprecated alias of warning.
                logger.warning("Unknown decision_chamber %r", txt)
        elif label == "Stadiu:":
            rv['status_text'] = val_td.text()
        elif label == "Initiator:":
            # Sponsors: only deputies (cam=2, the default) are recorded,
            # keyed as (legislature, mandate id).
            for link in pqitems(val_td, 'a'):
                args = url_args(link.attr('href'))
                if args.get('cam', 2, type=int) == 2:
                    cdep_id = (
                        args.get('leg', type=int),
                        args.get('idm', type=int),
                    )
                    rv['sponsorship'].append(cdep_id)
    rv['activity'] = self.get_activity(page)
    rv['date'] = get_date_from_numbers(date_texts)
    if rv['date'] is None:
        # Fall back to the first activity entry's date.
        rv['date'] = rv['activity'][0].date
    return rv
def fetch_proposal_details(self, prop):
    """Populate *prop* in place from its cdep and/or senate detail pages.

    Sets title, registration numbers, decision chamber, pdf_url, status,
    date and merged activity. Raises AssertionError when no date can be
    derived from any registration number.
    """
    # NOTE(review): this first fetch is immediately shadowed below;
    # presumably kept for its caching side effect — confirm.
    page = self.fetch_url(prop.url)
    page_cdep = page_senate = None
    if prop.url_cdep:
        page_cdep = self.fetch_url(prop.url_cdep)
    if prop.url_senate:
        page_senate = self.fetch_url(prop.url_senate)
    page = page_cdep or page_senate
    prop.title = pq(".headline", page).text()
    prop.number_bpi = None
    prop.number_cdep = None
    prop.number_senate = None
    prop.decision_chamber = None
    prop.pdf_url = None
    prop.status = None
    prop.status_text = None
    # Anchor on the registration-number cell to find the metadata table.
    [hook_td] = pqitems(page, ':contains("Nr. înregistrare")')
    metadata_table = pq(hook_td.parents("table")[-1])
    for row in pqitems(metadata_table.children("tr")):
        cols = row.children()
        label = cols.eq(0).text().strip()
        val_td = cols.eq(1) if len(cols) > 1 else None
        if label == "- B.P.I.:":
            prop.number_bpi = val_td.text()
        elif label == "- Camera Deputatilor:":
            prop.number_cdep = val_td.text()
        elif label == "- Senat:":
            prop.number_senate = val_td.text()
        elif label == "Tip initiativa:":
            prop.proposal_type = val_td.text()
        elif label == "Consultati:":
            # The initiator's form links the full-text PDF.
            for tr in pqitems(val_td, "tr"):
                if tr.text() == "Forma iniţiatorului":
                    [a] = pqitems(tr, "a")
                    href = a.attr("href")
                    prop.pdf_url = href
        elif label == "Camera decizionala:":
            txt = val_td.text()
            if txt == "Camera Deputatilor":
                prop.decision_chamber = "cdep"
            elif txt == "Senatul":
                prop.decision_chamber = "senat"
            elif txt == "Camera Deputatilor + Senatul":
                prop.decision_chamber = "common"
            else:
                # Fix: logger.warn is a deprecated alias of warning.
                logger.warning("Unknown decision_chamber %r", txt)
        elif label == "Stadiu:":
            prop.status_text = val_td.text()
            prop.status = self.classify_status(prop.status_text)
    prop.date = get_date_from_numbers([prop.number_bpi,
                                       prop.number_cdep,
                                       prop.number_senate])
    assert prop.date is not None, "No date for proposal %r" % prop.url
    cdep_activity = self.get_activity(page_cdep) if page_cdep else []
    senate_activity = self.get_activity(page_senate) if page_senate else []
    prop.activity = self.merge_activity(cdep_activity, senate_activity)