Ejemplo n.º 1
0
def test_correlate_cdep_senate(session):
    """Check that a scraped proposal carries both chamber primary keys.

    Saved pages are registered in ``session.url_map``, the proposals of
    mandate ``(2012, 65)`` are scraped, and the proposal that sorts first
    by title must expose both ``cdeppk_cdep`` and ``cdeppk_senate``.
    """
    from mptracker.scraper.proposals import ProposalScraper

    # (URL, saved page file name) pairs; the files are looked up in PAGES_DIR.
    saved_pages = [
        (LISTING_URL % (65, 2012), "proposal-listing-2012-65"),
        (PROPOSAL_URL + "idp=17113&cam=1", "proposal-1-17113"),
        (PROPOSAL_URL + "idp=13330&cam=2", "proposal-2-13330"),
        (PROPOSAL_URL + "idp=13526&cam=2", "proposal-2-13526"),
        (PROPOSAL_URL + "idp=17422&cam=1", "proposal-1-17422"),
        (PROPOSAL_URL + "idp=17334&cam=1", "proposal-1-17334"),
    ]
    session.url_map.update(
        {url: PAGES_DIR / file_name for url, file_name in saved_pages}
    )

    scraper = ProposalScraper(session)
    found = scraper.fetch_from_mp_pages([(2012, 65)])
    assert len(found) == 4

    # Sort by title so the proposal under test has a deterministic position.
    found.sort(key=lambda item: item.title)
    first = found[0]

    expected_title = (
        "BP327/2013 Propunere legislativă privind "
        "facilitățile acordate șomerilor pentru "
        "transportul intern"
    )
    assert first.title == expected_title
    assert first.cdeppk_cdep == 13330
    assert first.cdeppk_senate == 17334
Ejemplo n.º 2
0
def test_correlate_cdep_senate(session):
    """Verify that one scraped proposal reports keys for both chambers.

    The test scrapes mandate ``(2012, 65)`` from saved pages and inspects
    the proposal whose title sorts first.
    """
    from mptracker.scraper.proposals import ProposalScraper

    # Build the URL -> saved page mapping one entry at a time.
    url_to_page = {}
    url_to_page[LISTING_URL % (65, 2012)] = PAGES_DIR / 'proposal-listing-2012-65'
    url_to_page[PROPOSAL_URL + 'idp=17113&cam=1'] = PAGES_DIR / 'proposal-1-17113'
    url_to_page[PROPOSAL_URL + 'idp=13330&cam=2'] = PAGES_DIR / 'proposal-2-13330'
    url_to_page[PROPOSAL_URL + 'idp=13526&cam=2'] = PAGES_DIR / 'proposal-2-13526'
    url_to_page[PROPOSAL_URL + 'idp=17422&cam=1'] = PAGES_DIR / 'proposal-1-17422'
    url_to_page[PROPOSAL_URL + 'idp=17334&cam=1'] = PAGES_DIR / 'proposal-1-17334'
    session.url_map.update(url_to_page)

    scraped = ProposalScraper(session).fetch_from_mp_pages([(2012, 65)])

    assert len(scraped) == 4
    scraped.sort(key=lambda entry: entry.title)
    target = scraped[0]

    title_parts = [
        'BP327/2013 Propunere legislativă privind ',
        'facilitățile acordate șomerilor pentru ',
        'transportul intern',
    ]
    assert target.title == ''.join(title_parts)
    assert target.cdeppk_cdep == 13330
    assert target.cdeppk_senate == 17334
Ejemplo n.º 3
0
def test_simple_scraping(session):
    """Scrape mandate ``(2012, 126)`` from saved pages and check one proposal.

    The proposal that sorts first by title is checked field by field:
    sponsorships, registration numbers, deciding chamber, type and URLs.
    """
    from mptracker.scraper.proposals import ProposalScraper

    # (URL, saved page file name) pairs; the files are looked up in PAGES_DIR.
    saved_pages = (
        (LISTING_URL % (126, 2012), 'proposal-listing-2012-126'),
        (PROPOSAL_URL + 'idp=17135&cam=1', 'proposal-1-17135'),
        (PROPOSAL_URL + 'idp=13348&cam=2', 'proposal-2-13348'),
        (PROPOSAL_URL + 'idp=17422&cam=1', 'proposal-1-17422'),
        (PROPOSAL_URL + 'idp=17343&cam=1', 'proposal-1-17343'),
    )
    session.url_map.update(
        {url: PAGES_DIR / file_name for url, file_name in saved_pages}
    )

    scraper = ProposalScraper(session)
    scraped = scraper.fetch_from_mp_pages([(2012, 126)])
    assert len(scraped) == 3

    scraped.sort(key=lambda entry: entry.title)
    first = scraped[0]

    assert first.sponsorships == [(2012, 126)]
    assert first.number_bpi == '346/04-06-2013'
    assert first.number_cdep == 'BP346/04.06.2013'
    assert first.number_senate == 'L430/03.09.2013'
    assert first.decision_chamber == 'cdep'
    assert first.proposal_type == 'Propunere legislativa'

    expected_pdf = 'http://www.cdep.ro/proiecte/bp/' + '2013/300/40/6/pl346.pdf'
    assert first.pdf_url == expected_pdf

    assert "declararea zilei de 10 mai" in first.title

    expected_url = (
        'http://www.cdep.ro/pls/proiecte/upl_pck.proiect' + '?idp=13348&cam=2'
    )
    assert first.url == expected_url
Ejemplo n.º 4
0
def test_simple_scraping(session):
    """Check every scraped field of one proposal of mandate ``(2012, 126)``.

    Pages come from files registered in ``session.url_map``; the proposal
    inspected is the one whose title sorts first.
    """
    from mptracker.scraper.proposals import ProposalScraper

    # Build the URL -> saved page mapping one entry at a time.
    url_to_page = {}
    url_to_page[LISTING_URL % (126, 2012)] = PAGES_DIR / "proposal-listing-2012-126"
    url_to_page[PROPOSAL_URL + "idp=17135&cam=1"] = PAGES_DIR / "proposal-1-17135"
    url_to_page[PROPOSAL_URL + "idp=13348&cam=2"] = PAGES_DIR / "proposal-2-13348"
    url_to_page[PROPOSAL_URL + "idp=17422&cam=1"] = PAGES_DIR / "proposal-1-17422"
    url_to_page[PROPOSAL_URL + "idp=17343&cam=1"] = PAGES_DIR / "proposal-1-17343"
    session.url_map.update(url_to_page)

    results = ProposalScraper(session).fetch_from_mp_pages([(2012, 126)])

    assert len(results) == 3
    results.sort(key=lambda item: item.title)
    target = results[0]

    assert target.sponsorships == [(2012, 126)]
    assert target.number_bpi == "346/04-06-2013"
    assert target.number_cdep == "BP346/04.06.2013"
    assert target.number_senate == "L430/03.09.2013"
    assert target.decision_chamber == "cdep"
    assert target.proposal_type == "Propunere legislativa"
    assert target.pdf_url == (
        "http://www.cdep.ro/proiecte/bp/"
        "2013/300/40/6/pl346.pdf"
    )
    assert "declararea zilei de 10 mai" in target.title
    assert target.url == (
        "http://www.cdep.ro/pls/proiecte/upl_pck.proiect"
        "?idp=13348&cam=2"
    )
Ejemplo n.º 5
0
def get_proposal_single_page(
        chamber,
        pk,
        cache_name=None,
    ):
    """Scrape one proposal page and dump the result as a YAML file.

    The file is written to ``<PROJECT_ROOT>/proposals/`` and is named
    ``proposal_<pk>_<chamber>.yml``. ``PROJECT_ROOT`` is two directories
    above the directory that contains this module. The ``proposals``
    directory is created when it does not exist yet.

    :param chamber: chamber number; coerced with ``int()``.
    :param pk: primary key of the proposal; coerced with ``int()``.
    :param cache_name: cache name handed to ``create_session``; when falsy,
        ``_get_config_cache_name()`` is used instead.
    :raises ValueError: if ``chamber`` or ``pk`` cannot be converted to int.
    """
    import os
    import yaml
    from mptracker.scraper.proposals import ProposalScraper

    SCRAPER_PACKAGE = path(__file__).abspath().parent
    PROJECT_ROOT = SCRAPER_PACKAGE.parent.parent
    output_dir = PROJECT_ROOT / "proposals"

    session = create_session(cache_name=cache_name or _get_config_cache_name())
    scraper = ProposalScraper(session)

    pk = int(pk)
    chamber = int(chamber)

    logger.info("scraping %d %d", chamber, pk)
    result = scraper.scrape_proposal_page(chamber, pk)

    keylist = ["date", "html", "location"]
    filename = output_dir / "proposal_{pk_p}_{chamber_p}.yml".format(
        pk_p=pk, chamber_p=chamber)

    # Replace the activity objects with the output of their ``as_dict``
    # restricted to ``keylist``. The membership check happens *before* the
    # lookup so that a result without activity yields an empty list instead
    # of raising KeyError.
    activity_items = result['activity'] if 'activity' in result else []
    result['activity'] = [item.as_dict(keylist) for item in activity_items]

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # The context manager guarantees the file is flushed and closed even if
    # serialization fails part-way.
    with open(filename, "w") as outfile:
        outfile.write(yaml.dump(result, default_flow_style=True))
Ejemplo n.º 6
0
def get_proposal_pages(
        throttle=None,
        cache_name=None,
        year=None,
        ):
    """Dump every proposal listed for ``year`` via ``get_proposal_single_page``.

    Proposals are listed with ``list_proposals(2, year)`` first and
    ``list_proposals(1, year)`` second; each listed record is then handed to
    ``get_proposal_single_page`` together with ``cache_name``.

    :param throttle: optional delay handed to ``create_session`` as a float;
        a falsy value means no throttle.
    :param cache_name: cache name for the listing session; when falsy,
        ``_get_config_cache_name()`` is used for the listing session.
    :param year: forwarded unchanged to ``list_proposals``.
    """
    from itertools import chain
    from mptracker.scraper.proposals import ProposalScraper

    listing_cache = cache_name or _get_config_cache_name()
    if throttle:
        delay = float(throttle)
    else:
        delay = None

    scraper = ProposalScraper(
        create_session(cache_name=listing_cache, throttle=delay)
    )

    # Both listings are requested up front, before any page is processed.
    first_listing = scraper.list_proposals(2, year)
    second_listing = scraper.list_proposals(1, year)

    # NOTE(review): only ``cache_name`` is forwarded below, so ``throttle``
    # appears to apply to the listing session only -- confirm this is
    # intended.
    for entry in chain(first_listing, second_listing):
        get_proposal_single_page(entry['chamber'], entry['pk'], cache_name)
Ejemplo n.º 7
0
def test_get_activity(session):
    """Check the activity entries extracted from the saved page of proposal 13037.

    Entries at positions 0, 3, 4 and -1 are checked for their date and, where
    relevant, location and HTML fragments.
    """
    from mptracker.scraper.proposals import ProposalScraper

    PROP_URL = "http://www.cdep.ro/pls/proiecte/upl_pck.proiect?idp=13037"
    session.url_map.update({PROP_URL: PAGES_DIR / "proposal-2-13037"})

    scraper = ProposalScraper(session)
    entries = scraper.get_activity(scraper.fetch_url(PROP_URL))

    first = entries[0]
    assert "prezentare în Biroul Permanent" in first.html
    assert first.location == "CD"
    assert first.date == date(2013, 2, 11)

    fourth = entries[3]
    assert fourth.date == date(2013, 6, 5)
    assert "la Camera Deputaţilor pentru dezbatere" in fourth.html
    assert "trimis pentru aviz la" in fourth.html

    assert entries[4].date == date(2013, 6, 13)

    last = entries[-1]
    assert last.date == date(2013, 6, 25)
    assert "primire aviz de la" in last.html
    assert "Comisia pentru sănătate şi familie" in last.html
    assert "(pdf)" in last.html
Ejemplo n.º 8
0
def test_get_activity(session):
    """Verify dates, location and HTML snippets of scraped activity items.

    The page for proposal 13037 is served from a file under ``PAGES_DIR``.
    """
    from mptracker.scraper.proposals import ProposalScraper
    PROP_URL = 'http://www.cdep.ro/pls/proiecte/upl_pck.proiect?idp=13037'

    page_map = {PROP_URL: PAGES_DIR / 'proposal-2-13037'}
    session.url_map.update(page_map)

    scraper = ProposalScraper(session)
    fetched = scraper.fetch_url(PROP_URL)
    items = scraper.get_activity(fetched)

    # Opening entry.
    opening = items[0]
    assert "prezentare în Biroul Permanent" in opening.html
    assert opening.location == 'CD'
    assert opening.date == date(2013, 2, 11)

    # Entry at index 3.
    middle = items[3]
    assert middle.date == date(2013, 6, 5)
    for fragment in ("la Camera Deputaţilor pentru dezbatere",
                     "trimis pentru aviz la"):
        assert fragment in middle.html

    assert items[4].date == date(2013, 6, 13)

    # Closing entry.
    closing = items[-1]
    assert closing.date == date(2013, 6, 25)
    for fragment in ("primire aviz de la",
                     "Comisia pentru sănătate şi familie",
                     '(pdf)'):
        assert fragment in closing.html
Ejemplo n.º 9
0
def test_merge_activity(session):
    """Merge the activity of two saved proposal pages and check entries 3 and 4.

    One page is fetched with ``cam=2`` and the other with ``cam=1``; both are
    served from files under ``PAGES_DIR``.
    """
    from mptracker.scraper.proposals import ProposalScraper
    PROP_URL_CDEP = PROPOSAL_URL + 'idp=13037&cam=2'
    PROP_URL_SENATE = PROPOSAL_URL + 'idp=17003&cam=1'

    page_map = {}
    page_map[PROP_URL_CDEP] = PAGES_DIR / 'proposal-2-13037'
    page_map[PROP_URL_SENATE] = PAGES_DIR / 'proposal-1-17003'
    session.url_map.update(page_map)

    scraper = ProposalScraper(session)

    # Fetch and parse each page in turn, then merge the two activity lists.
    cdep_page = scraper.fetch_url(PROP_URL_CDEP)
    cdep_activity = scraper.get_activity(cdep_page)
    senate_page = scraper.fetch_url(PROP_URL_SENATE)
    senate_activity = scraper.get_activity(senate_page)
    merged = scraper.merge_activity(cdep_activity, senate_activity)

    fourth = merged[3]
    assert fourth.date == date(2013, 2, 12)
    assert "înregistrat la Senat pentru dezbatere" in fourth.html
    assert "cu nr.b38 (adresa nr.bpi19/11-02-2013)" in fourth.html

    fifth = merged[4]
    assert fifth.date == date(2013, 2, 19)
    assert "trimis pentru aviz la Consiliul legislativ" in fifth.html
Ejemplo n.º 10
0
def test_merge_activity(session):
    """Check that ``merge_activity`` combines the activity of two pages.

    Entries at index 3 and 4 of the merged list are checked for their date
    and for expected HTML fragments.
    """
    from mptracker.scraper.proposals import ProposalScraper

    PROP_URL_CDEP = PROPOSAL_URL + "idp=13037&cam=2"
    PROP_URL_SENATE = PROPOSAL_URL + "idp=17003&cam=1"

    saved_pages = [
        (PROP_URL_CDEP, "proposal-2-13037"),
        (PROP_URL_SENATE, "proposal-1-17003"),
    ]
    session.url_map.update(
        {url: PAGES_DIR / file_name for url, file_name in saved_pages}
    )

    scraper = ProposalScraper(session)
    from_cdep = scraper.get_activity(scraper.fetch_url(PROP_URL_CDEP))
    from_senate = scraper.get_activity(scraper.fetch_url(PROP_URL_SENATE))
    combined = scraper.merge_activity(from_cdep, from_senate)

    registered = combined[3]
    assert registered.date == date(2013, 2, 12)
    for fragment in ("înregistrat la Senat pentru dezbatere",
                     "cu nr.b38 (adresa nr.bpi19/11-02-2013)"):
        assert fragment in registered.html

    forwarded = combined[4]
    assert forwarded.date == date(2013, 2, 19)
    assert "trimis pentru aviz la Consiliul legislativ" in forwarded.html
Ejemplo n.º 11
0
def proposals(
        cache_name=None,
        throttle=None,
        autoanalyze=False,
        ):
    """Scrape proposals and synchronise them into the database.

    Proposals are fetched with ``ProposalScraper.fetch_from_mp_pages`` for
    every mandate whose ``year`` is 2012, then written to ``models.Proposal``
    and ``models.ProposalActivityItem`` through two ``TablePatcher``
    instances (both run with ``remove=True``). Sponsorships of each proposal
    are added/removed to match the scraped data, and the session is
    committed at the end.

    :param cache_name: forwarded to ``create_session``.
    :param throttle: converted to ``float`` and forwarded to
        ``create_session``; a falsy value is passed as ``None``.
    :param autoanalyze: when true, schedules ``ocr_proposal`` for changed
        proposals that have a ``pdf_url`` and ``calculate_proposal`` for
        every seen proposal whose ``policy_domain_id`` is ``None``.
    """
    from mptracker.scraper.proposals import ProposalScraper
    from mptracker.proposals import ocr_proposal
    from mptracker.policy import calculate_proposal

    proposal_scraper = ProposalScraper(create_session(
            cache_name=cache_name,
            throttle=float(throttle) if throttle else None))

    def cdep_id(mandate):
        # Key identifying a mandate: (year, cdep_number).
        return (mandate.year, mandate.cdep_number)

    # Mandates of year 2012, indexed by their (year, cdep_number) key.
    by_cdep_id = {cdep_id(m): m
                  for m in models.Mandate.query
                  if m.year == 2012}

    # Map each chamber-specific primary key to the id of the existing
    # Proposal row, so scraped proposals can reuse the row id.
    id_cdeppk_cdep = {}
    id_cdeppk_senate = {}
    for proposal in models.Proposal.query:
        if proposal.cdeppk_cdep:
            id_cdeppk_cdep[proposal.cdeppk_cdep] = proposal.id
        if proposal.cdeppk_senate:
            id_cdeppk_senate[proposal.cdeppk_senate] = proposal.id

    chamber_by_slug = {c.slug: c for c in models.Chamber.query}

    proposals = proposal_scraper.fetch_from_mp_pages(set(by_cdep_id.keys()))

    # Existing activity items grouped by the proposal they belong to.
    all_activity = defaultdict(list)
    for item in models.ProposalActivityItem.query:
        all_activity[item.proposal_id].append(item)

    proposal_patcher = TablePatcher(models.Proposal,
                                    models.db.session,
                                    key_columns=['id'])

    activity_patcher = TablePatcher(models.ProposalActivityItem,
                                    models.db.session,
                                    key_columns=['id'])

    # All three counters count proposals, not individual sponsorships: each
    # is incremented at most once per scraped proposal.
    sp_updates = sp_added = sp_removed = 0

    changed = []  # rows for which the patcher reported is_changed
    seen = []  # every row produced by the patcher, changed or not

    with proposal_patcher.process(autoflush=1000, remove=True) as add_proposal:
        with activity_patcher.process(autoflush=1000, remove=True) \
                as add_activity:
            for prop in proposals:
                record = model_to_dict(prop, ['cdeppk_cdep', 'cdeppk_senate',
                    'decision_chamber', 'url', 'title', 'date', 'number_bpi',
                    'number_cdep', 'number_senate', 'proposal_type',
                    'pdf_url'])

                # Replace the chamber slug with the Chamber object.
                slug = prop.decision_chamber
                if slug:
                    record['decision_chamber'] = chamber_by_slug[slug]

                # Look up an existing row id through either chamber key. If
                # the two keys point at different rows, the row found via
                # the senate key is deleted and the other id is kept.
                idc = id_cdeppk_cdep.get(prop.cdeppk_cdep)
                ids = id_cdeppk_senate.get(prop.cdeppk_senate)
                if idc and ids and idc != ids:
                    # NOTE(review): if ``logger`` is a stdlib logging.Logger,
                    # ``warn`` is a deprecated alias of ``warning``.
                    logger.warn("Two different records for the same proposal: "
                                "(%s, %s). Removing the 2nd.", idc, ids)
                    models.db.session.delete(models.Proposal.query.get(ids))
                    ids = None
                # No existing row under either key: generate a fresh id.
                record['id'] = idc or ids or models.random_uuid()

                result = add_proposal(record)
                row = result.row
                if result.is_changed:
                    changed.append(row)
                seen.append(row)

                # Diff the scraped sponsors against the stored sponsorships.
                # A sponsorship key missing from by_cdep_id raises KeyError.
                new_people = set(by_cdep_id[ci] for ci in prop.sponsorships)
                existing_sponsorships = {sp.mandate: sp
                                         for sp in row.sponsorships}
                to_remove = set(existing_sponsorships) - set(new_people)
                to_add = set(new_people) - set(existing_sponsorships)
                if to_remove:
                    logger.info("Removing sponsors %s: %r", row.id,
                                [cdep_id(m) for m in to_remove])
                    sp_removed += 1
                    for m in to_remove:
                        sp = existing_sponsorships[m]
                        models.db.session.delete(sp)
                if to_add:
                    logger.info("Adding sponsors %s: %r", row.id,
                                [cdep_id(m) for m in to_add])
                    sp_added += 1
                    for m in to_add:
                        row.sponsorships.append(models.Sponsorship(mandate=m))

                if to_remove or to_add:
                    sp_updates += 1

                # Stored activity is reused only when it is a prefix of the
                # scraped activity, compared on (date, location). Otherwise
                # no stored item is reused and every scraped item gets a new
                # id below.
                db_activity = all_activity[row.id]
                db_activity.sort(key=lambda a: a.order)
                act_fields = lambda r: (r.date, r.location)
                if ([act_fields(r) for r in db_activity] !=
                    [act_fields(r) for r in prop.activity[:len(db_activity)]]):
                    logger.warn("History doesn't match for %s, "
                                "%d items will be removed",
                                row.id,len(db_activity))
                    db_activity = []

                # NOTE: ``record`` is rebound here to the activity record.
                for n, ac in enumerate(prop.activity):
                    record = model_to_dict(ac, ['date', 'location', 'html'])
                    record['proposal_id'] = row.id
                    record['order'] = n
                    if n < len(db_activity):
                        # Reuse the id of the stored item at this position.
                        item = db_activity[n]
                        record['id'] = item.id
                        # Sanity checks that the stored item really matches.
                        assert item.date == record['date']
                        assert item.location == record['location']
                        assert item.order == record['order']
                    else:
                        record['id'] = models.random_uuid()
                    add_activity(record)

    models.db.session.commit()

    logger.info("Updated sponsorship for %d proposals (+%d, -%d)",
                sp_updates, sp_added, sp_removed)

    if autoanalyze:
        # OCR is scheduled only for changed proposals that have a PDF.
        logger.info("Scheduling analysis jobs for %d proposals", len(changed))
        for proposal in changed:
            if proposal.pdf_url:
                ocr_proposal.delay(proposal.id, autoanalyze=True)

        # Policy calculation is scheduled for any seen proposal that has no
        # policy domain yet. The logged count is len(seen), not the number
        # of jobs actually scheduled.
        logger.info("Scheduling policy jobs for %d proposals", len(seen))
        for proposal in seen:
            if proposal.policy_domain_id is None:
                calculate_proposal.delay(proposal.id)
Ejemplo n.º 12
0
def proposals(dry_run=False):
    """Scrape proposals and patch them, with sponsorships, into the database.

    Proposals are fetched for every mandate whose ``year`` is 2012 and passed
    to a ``TablePatcher`` keyed on ``combined_id`` (run with ``remove=True``).
    For each resulting row the sponsorships are aligned with the scraped
    ones.

    :param dry_run: when true, ``models.db.session.rollback()`` is called
        after all records were processed, still inside the patcher context.

    NOTE(review): this function never commits the session itself --
    presumably ``TablePatcher`` or the caller does; confirm.
    """
    from mptracker.scraper.proposals import ProposalScraper

    session = create_session(cache_name='page-cache', throttle=0.5)
    proposal_scraper = ProposalScraper(session)

    def cdep_id(mandate):
        # Key identifying a mandate: (year, cdep_number).
        return (mandate.year, mandate.cdep_number)

    # Mandates of year 2012, indexed by their (year, cdep_number) key.
    by_cdep_id = {}
    for mandate in models.Mandate.query:
        if mandate.year == 2012:
            by_cdep_id[cdep_id(mandate)] = mandate

    chamber_by_slug = {}
    for chamber in models.Chamber.query:
        chamber_by_slug[chamber.slug] = chamber

    proposals = proposal_scraper.fetch_from_mp_pages(set(by_cdep_id.keys()))

    proposal_patcher = TablePatcher(models.Proposal,
                                    models.db.session,
                                    key_columns=['combined_id'])

    # Each counter counts proposals, not individual sponsorships.
    sp_updates = 0
    sp_added = 0
    sp_removed = 0

    with proposal_patcher.process(autoflush=1000, remove=True) as upsert:
        for record in proposals:
            # Swap the chamber slug for the Chamber object.
            if 'decision_chamber' in record:
                chamber_slug = record.pop('decision_chamber')
                record['decision_chamber'] = chamber_by_slug[chamber_slug]

            scraped_sponsors = record.pop('_sponsorships')
            # The value is not used afterwards; the lookup is kept so that a
            # record without 'url' still raises KeyError at this point.
            _url = record['url']

            row = upsert(record).row

            wanted = set(by_cdep_id[key] for key in scraped_sponsors)
            current = {}
            for sponsorship in row.sponsorships:
                current[sponsorship.mandate] = sponsorship

            stale = set(current) - set(wanted)
            missing = set(wanted) - set(current)

            if stale:
                logger.info("Removing sponsors %s: %r", row.combined_id,
                            [cdep_id(mandate) for mandate in stale])
                sp_removed += 1
                for mandate in stale:
                    models.db.session.delete(current[mandate])

            if missing:
                logger.info("Adding sponsors %s: %r", row.combined_id,
                            [cdep_id(mandate) for mandate in missing])
                sp_added += 1
                for mandate in missing:
                    row.sponsorships.append(
                        models.Sponsorship(mandate=mandate))

            if stale or missing:
                sp_updates += 1

        if dry_run:
            models.db.session.rollback()

    logger.info("Updated sponsorship for %d proposals (+%d, -%d)",
                sp_updates, sp_added, sp_removed)