def test_correlate_cdep_senate(session):
    from mptracker.scraper.proposals import ProposalScraper
    session.url_map.update({
        LISTING_URL % (65, 2012): PAGES_DIR / 'proposal-listing-2012-65',
        PROPOSAL_URL + 'idp=17113&cam=1': PAGES_DIR / 'proposal-1-17113',
        PROPOSAL_URL + 'idp=13330&cam=2': PAGES_DIR / 'proposal-2-13330',
        PROPOSAL_URL + 'idp=13526&cam=2': PAGES_DIR / 'proposal-2-13526',
        PROPOSAL_URL + 'idp=17422&cam=1': PAGES_DIR / 'proposal-1-17422',
        PROPOSAL_URL + 'idp=17334&cam=1': PAGES_DIR / 'proposal-1-17334',
    })
    scraper = ProposalScraper(session)
    proposals = scraper.fetch_from_mp_pages([(2012, 65)])
    assert len(proposals) == 4
    proposals.sort(key=lambda p: p.title)
    pr = proposals[0]
    assert pr.title == ('BP327/2013 Propunere legislativă privind '
                        'facilitățile acordate șomerilor pentru '
                        'transportul intern')
    assert pr.cdeppk_cdep == 13330
    assert pr.cdeppk_senate == 17334
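# The tests in this module run the scraper against pages saved on disk. They
# assume a `session` fixture exposing a `url_map` dict that routes each URL
# to a local file instead of hitting cdep.ro. A minimal sketch of such a
# fake session (the class and method names here are assumptions for
# illustration; the real fixture lives in the test harness):
class FakeSession:

    def __init__(self):
        self.url_map = {}  # maps a URL to the path of a saved page

    def get(self, url):
        # Serve the pre-downloaded page from disk instead of the network.
        with open(self.url_map[url], 'rb') as f:
            return f.read()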
def test_simple_scraping(session):
    from mptracker.scraper.proposals import ProposalScraper
    session.url_map.update({
        LISTING_URL % (126, 2012): PAGES_DIR / 'proposal-listing-2012-126',
        PROPOSAL_URL + 'idp=17135&cam=1': PAGES_DIR / 'proposal-1-17135',
        PROPOSAL_URL + 'idp=13348&cam=2': PAGES_DIR / 'proposal-2-13348',
        PROPOSAL_URL + 'idp=17422&cam=1': PAGES_DIR / 'proposal-1-17422',
        PROPOSAL_URL + 'idp=17343&cam=1': PAGES_DIR / 'proposal-1-17343',
    })
    scraper = ProposalScraper(session)
    proposals = scraper.fetch_from_mp_pages([(2012, 126)])
    assert len(proposals) == 3
    proposals.sort(key=lambda p: p.title)
    pr = proposals[0]
    assert pr.sponsorships == [(2012, 126)]
    assert pr.number_bpi == '346/04-06-2013'
    assert pr.number_cdep == 'BP346/04.06.2013'
    assert pr.number_senate == 'L430/03.09.2013'
    assert pr.decision_chamber == 'cdep'
    assert pr.proposal_type == 'Propunere legislativa'
    assert pr.pdf_url == ('http://www.cdep.ro/proiecte/bp/'
                          '2013/300/40/6/pl346.pdf')
    assert "declararea zilei de 10 mai" in pr.title
    assert pr.url == ('http://www.cdep.ro/pls/proiecte/upl_pck.proiect'
                      '?idp=13348&cam=2')
def get_proposal_single_page(
        chamber,
        pk,
        cache_name=None,
        ):
    import os
    import yaml
    from mptracker.scraper.proposals import ProposalScraper

    SCRAPER_PACKAGE = path(__file__).abspath().parent
    PROJECT_ROOT = SCRAPER_PACKAGE.parent.parent

    session = create_session(cache_name=cache_name or
                             _get_config_cache_name())
    scraper = ProposalScraper(session)

    pk = int(pk)
    chamber = int(chamber)
    logger.info("scraping %d %d", chamber, pk)
    result = scraper.scrape_proposal_page(chamber, pk)

    # Flatten the activity items to plain dicts so yaml can serialize them.
    keylist = ['date', 'html', 'location']
    if 'activity' in result:
        result['activity'] = [k.as_dict(keylist)
                              for k in result['activity']]

    out_dir = PROJECT_ROOT / 'proposals'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    filename = out_dir / 'proposal_{pk}_{chamber}.yml'.format(
        pk=pk, chamber=chamber)
    with open(filename, 'w') as outfile:
        outfile.write(yaml.dump(result, default_flow_style=True))
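# Usage sketch for the function above (argument values are illustrative):
#
#   get_proposal_single_page(chamber=2, pk=13348)
#
# writes proposals/proposal_13348_2.yml, where each 'activity' entry has
# been flattened to a {date, html, location} dict via as_dict(keylist).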
def get_proposal_pages(
        throttle=None,
        cache_name=None,
        year=None,
        ):
    from itertools import chain
    from mptracker.scraper.proposals import ProposalScraper
    session = create_session(
        cache_name=cache_name or _get_config_cache_name(),
        throttle=float(throttle) if throttle else None,
    )
    scraper = ProposalScraper(session)
    for record in chain(scraper.list_proposals(2, year),
                        scraper.list_proposals(1, year)):
        get_proposal_single_page(record['chamber'], record['pk'], cache_name)
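# Chamber codes follow the cam= query parameter seen in the tests (2 appears
# to be the Chamber of Deputies, 1 the Senate), so the loop above dumps
# Chamber of Deputies proposals first, then Senate ones.
# Usage sketch (arguments are illustrative):
#
#   get_proposal_pages(throttle='0.5', year=2013)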
def test_get_activity(session):
    from mptracker.scraper.proposals import ProposalScraper
    PROP_URL = 'http://www.cdep.ro/pls/proiecte/upl_pck.proiect?idp=13037'
    session.url_map.update({
        PROP_URL: PAGES_DIR / 'proposal-2-13037',
    })
    scraper = ProposalScraper(session)
    page = scraper.fetch_url(PROP_URL)
    activity = scraper.get_activity(page)
    assert "prezentare în Biroul Permanent" in activity[0].html
    assert activity[0].location == 'CD'
    assert activity[0].date == date(2013, 2, 11)
    assert activity[3].date == date(2013, 6, 5)
    assert "la Camera Deputaţilor pentru dezbatere" in activity[3].html
    assert "trimis pentru aviz la" in activity[3].html
    assert activity[4].date == date(2013, 6, 13)
    assert activity[-1].date == date(2013, 6, 25)
    assert "primire aviz de la" in activity[-1].html
    assert "Comisia pentru sănătate şi familie" in activity[-1].html
    assert '(pdf)' in activity[-1].html
def test_merge_activity(session):
    from mptracker.scraper.proposals import ProposalScraper
    PROP_URL_CDEP = PROPOSAL_URL + 'idp=13037&cam=2'
    PROP_URL_SENATE = PROPOSAL_URL + 'idp=17003&cam=1'
    session.url_map.update({
        PROP_URL_CDEP: PAGES_DIR / 'proposal-2-13037',
        PROP_URL_SENATE: PAGES_DIR / 'proposal-1-17003',
    })
    scraper = ProposalScraper(session)
    activity = scraper.merge_activity(
        scraper.get_activity(scraper.fetch_url(PROP_URL_CDEP)),
        scraper.get_activity(scraper.fetch_url(PROP_URL_SENATE)))
    assert activity[3].date == date(2013, 2, 12)
    assert "înregistrat la Senat pentru dezbatere" in activity[3].html
    assert "cu nr.b38 (adresa nr.bpi19/11-02-2013)" in activity[3].html
    assert activity[4].date == date(2013, 2, 19)
    assert "trimis pentru aviz la Consiliul legislativ" in activity[4].html
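# merge_activity, exercised above, interleaves the two chambers' activity
# lists into one chronological history. Assuming each input list is already
# sorted by date (which the assertions above are consistent with), a minimal
# sketch of such a merge:
import heapq

def merge_by_date(cdep_activity, senate_activity):
    # Stable merge of two date-sorted lists; this is an assumption about
    # merge_activity's behavior, not the scraper's actual implementation.
    return list(heapq.merge(cdep_activity, senate_activity,
                            key=lambda item: item.date))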
def proposals(
        cache_name=None,
        throttle=None,
        autoanalyze=False,
        ):
    from mptracker.scraper.proposals import ProposalScraper
    from mptracker.proposals import ocr_proposal
    from mptracker.policy import calculate_proposal

    proposal_scraper = ProposalScraper(create_session(
        cache_name=cache_name,
        throttle=float(throttle) if throttle else None))

    def cdep_id(mandate):
        return (mandate.year, mandate.cdep_number)

    by_cdep_id = {cdep_id(m): m
                  for m in models.Mandate.query
                  if m.year == 2012}

    id_cdeppk_cdep = {}
    id_cdeppk_senate = {}
    for proposal in models.Proposal.query:
        if proposal.cdeppk_cdep:
            id_cdeppk_cdep[proposal.cdeppk_cdep] = proposal.id
        if proposal.cdeppk_senate:
            id_cdeppk_senate[proposal.cdeppk_senate] = proposal.id

    chamber_by_slug = {c.slug: c for c in models.Chamber.query}

    proposals = proposal_scraper.fetch_from_mp_pages(set(by_cdep_id.keys()))

    all_activity = defaultdict(list)
    for item in models.ProposalActivityItem.query:
        all_activity[item.proposal_id].append(item)

    proposal_patcher = TablePatcher(models.Proposal,
                                    models.db.session,
                                    key_columns=['id'])

    activity_patcher = TablePatcher(models.ProposalActivityItem,
                                    models.db.session,
                                    key_columns=['id'])

    sp_updates = sp_added = sp_removed = 0

    changed = []
    seen = []

    with proposal_patcher.process(autoflush=1000, remove=True) \
            as add_proposal:
        with activity_patcher.process(autoflush=1000, remove=True) \
                as add_activity:
            for prop in proposals:
                record = model_to_dict(prop, ['cdeppk_cdep', 'cdeppk_senate',
                                              'decision_chamber', 'url',
                                              'title', 'date', 'number_bpi',
                                              'number_cdep', 'number_senate',
                                              'proposal_type', 'pdf_url'])
                slug = prop.decision_chamber
                if slug:
                    record['decision_chamber'] = chamber_by_slug[slug]

                idc = id_cdeppk_cdep.get(prop.cdeppk_cdep)
                ids = id_cdeppk_senate.get(prop.cdeppk_senate)
                if idc and ids and idc != ids:
                    logger.warn("Two different records for the same "
                                "proposal: (%s, %s). Removing the 2nd.",
                                idc, ids)
                    models.db.session.delete(models.Proposal.query.get(ids))
                    ids = None
                record['id'] = idc or ids or models.random_uuid()

                result = add_proposal(record)
                row = result.row
                if result.is_changed:
                    changed.append(row)
                seen.append(row)

                new_people = set(by_cdep_id[ci] for ci in prop.sponsorships)
                existing_sponsorships = {sp.mandate: sp
                                         for sp in row.sponsorships}
                to_remove = set(existing_sponsorships) - set(new_people)
                to_add = set(new_people) - set(existing_sponsorships)
                if to_remove:
                    logger.info("Removing sponsors %s: %r", row.id,
                                [cdep_id(m) for m in to_remove])
                    sp_removed += 1
                    for m in to_remove:
                        sp = existing_sponsorships[m]
                        models.db.session.delete(sp)
                if to_add:
                    logger.info("Adding sponsors %s: %r", row.id,
                                [cdep_id(m) for m in to_add])
                    sp_added += 1
                    for m in to_add:
                        row.sponsorships.append(
                            models.Sponsorship(mandate=m))
                if to_remove or to_add:
                    sp_updates += 1

                db_activity = all_activity[row.id]
                db_activity.sort(key=lambda a: a.order)
                act_fields = lambda r: (r.date, r.location)
                if ([act_fields(r) for r in db_activity] !=
                    [act_fields(r)
                     for r in prop.activity[:len(db_activity)]]):
                    logger.warn("History doesn't match for %s, "
                                "%d items will be removed",
                                row.id, len(db_activity))
                    db_activity = []

                for n, ac in enumerate(prop.activity):
                    record = model_to_dict(ac, ['date', 'location', 'html'])
                    record['proposal_id'] = row.id
                    record['order'] = n
                    if n < len(db_activity):
                        item = db_activity[n]
                        record['id'] = item.id
                        assert item.date == record['date']
                        assert item.location == record['location']
                        assert item.order == record['order']
                    else:
                        record['id'] = models.random_uuid()
                    add_activity(record)

    models.db.session.commit()

    logger.info("Updated sponsorship for %d proposals (+%d, -%d)",
                sp_updates, sp_added, sp_removed)

    if autoanalyze:
        logger.info("Scheduling analysis jobs for %d proposals",
                    len(changed))
        for proposal in changed:
            if proposal.pdf_url:
                ocr_proposal.delay(proposal.id, autoanalyze=True)

        logger.info("Scheduling policy jobs for %d proposals", len(seen))
        for proposal in seen:
            if proposal.policy_domain_id is None:
                calculate_proposal.delay(proposal.id)
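# Illustrative stub of the TablePatcher interface relied on above: process()
# yields an `add` callable, and each add(record) returns a result exposing
# .row and .is_changed. The real class lives elsewhere in mptracker; this
# sketch only mirrors the calls made here and is not its actual code.
from contextlib import contextmanager

class StubPatchResult:

    def __init__(self, row, is_changed):
        self.row = row
        self.is_changed = is_changed

class StubTablePatcher:

    def __init__(self, model, session, key_columns):
        self.model = model
        self.session = session
        self.key_columns = key_columns

    @contextmanager
    def process(self, autoflush=None, remove=False):
        def add(record):
            # Upsert keyed on key_columns; only the 'id' case is sketched.
            row = self.session.query(self.model).get(record['id'])
            if row is None:
                row = self.model(**record)
                self.session.add(row)
                return StubPatchResult(row, is_changed=True)
            changed = any(getattr(row, k) != v for k, v in record.items())
            for k, v in record.items():
                setattr(row, k, v)
            return StubPatchResult(row, is_changed=changed)
        yield add
        # With remove=True the real patcher also deletes rows it never saw.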
def proposals(dry_run=False):
    from mptracker.scraper.proposals import ProposalScraper
    proposal_scraper = ProposalScraper(create_session(
        cache_name='page-cache', throttle=0.5))

    def cdep_id(mandate):
        return (mandate.year, mandate.cdep_number)

    by_cdep_id = {cdep_id(m): m
                  for m in models.Mandate.query
                  if m.year == 2012}

    chamber_by_slug = {c.slug: c for c in models.Chamber.query}

    proposals = proposal_scraper.fetch_from_mp_pages(set(by_cdep_id.keys()))

    proposal_patcher = TablePatcher(models.Proposal,
                                    models.db.session,
                                    key_columns=['combined_id'])

    sp_updates = sp_added = sp_removed = 0

    with proposal_patcher.process(autoflush=1000, remove=True) as add:
        for record in proposals:
            if 'decision_chamber' in record:
                slug = record.pop('decision_chamber')
                record['decision_chamber'] = chamber_by_slug[slug]

            sponsorships = record.pop('_sponsorships')
            url = record['url']

            result = add(record)
            row = result.row

            new_people = set(by_cdep_id[ci] for ci in sponsorships)
            existing_sponsorships = {sp.mandate: sp
                                     for sp in row.sponsorships}
            to_remove = set(existing_sponsorships) - set(new_people)
            to_add = set(new_people) - set(existing_sponsorships)
            if to_remove:
                logger.info("Removing sponsors %s: %r", row.combined_id,
                            [cdep_id(m) for m in to_remove])
                sp_removed += 1
                for m in to_remove:
                    sp = existing_sponsorships[m]
                    models.db.session.delete(sp)
            if to_add:
                logger.info("Adding sponsors %s: %r", row.combined_id,
                            [cdep_id(m) for m in to_add])
                sp_added += 1
                for m in to_add:
                    row.sponsorships.append(models.Sponsorship(mandate=m))
            if to_remove or to_add:
                sp_updates += 1

    if dry_run:
        models.db.session.rollback()

    logger.info("Updated sponsorship for %d proposals (+%d, -%d)",
                sp_updates, sp_added, sp_removed)