def ocr_url(url, max_pages=MAX_OCR_PAGES): from mptracker.scraper.common import create_session pdf_cache_name = flask.current_app.config.get("MPTRACKER_PDF_CACHE") http_session = create_session(cache_name=pdf_cache_name, throttle=0.5) with temp_dir() as tmp: resp = http_session.get(url) if resp.status_code == 404: # cdep.ro doesn't have the PDF; skip it return [] if resp.status_code != 200: raise RuntimeError("PDF download failure (%d) at %r" % (resp.status_code, url)) pdf_data = resp.content pdf_path = tmp / "document.pdf" with pdf_path.open("wb") as f: f.write(pdf_data) subprocess.check_call(["pdfimages", pdf_path, tmp / "img"]) pages = [] for image_path in sorted(tmp.listdir("img-*"))[:MAX_OCR_PAGES]: subprocess.check_call(["tesseract", image_path, image_path, "-l", "ron"], stderr=subprocess.DEVNULL) text = (image_path + ".txt").text() pages.append(text) return pages
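# A minimal usage sketch for ocr_url (the URL is hypothetical). It assumes
# an active Flask app context (for the MPTRACKER_PDF_CACHE setting) and that
# `pdfimages` and `tesseract` with the Romanian ("ron") language data are
# installed:
#
#     with app.app_context():
#         pages = ocr_url("http://www.cdep.ro/pls/proiecte/docs/example.pdf")
#         for page_text in pages:
#             print(page_text)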
def ocr_url(url, max_pages=MAX_OCR_PAGES):
    from mptracker.scraper.common import create_session
    pdf_cache_name = flask.current_app.config.get('MPTRACKER_PDF_CACHE')
    http_session = create_session(cache_name=pdf_cache_name, throttle=0.5)
    with temp_dir() as tmp:
        resp = http_session.get(url)
        if resp.status_code != 200:
            raise RuntimeError("PDF download failure (%d) at %r"
                               % (resp.status_code, url))
        pdf_data = resp.content
        pdf_path = tmp / 'document.pdf'
        with pdf_path.open('wb') as f:
            f.write(pdf_data)
        subprocess.check_call(['pdfimages', pdf_path, tmp / 'img'])
        pages = []
        # honour the max_pages argument instead of the module-level constant
        for image_path in sorted(tmp.listdir('img-*'))[:max_pages]:
            subprocess.check_call(
                ['tesseract', image_path, image_path, '-l', 'ron'],
                stderr=subprocess.DEVNULL)
            text = (image_path + '.txt').text()
            pages.append(text)
        return pages
def committees(
        cache_name=None,
        throttle=None,
        no_commit=False,
        ):
    from mptracker.scraper.committees import CommitteeScraper

    patcher = TablePatcher(
        models.MpCommittee,
        models.db.session,
        key_columns=['chamber_id', 'cdep_id'],
    )

    http_session = create_session(
        cache_name=cache_name,
        throttle=throttle and float(throttle),
    )
    scraper = CommitteeScraper(http_session)

    with patcher.process(autoflush=1000, remove=True) as add:
        for committee in scraper.fetch_committees():
            add(committee.as_dict(['chamber_id', 'cdep_id', 'name']))

    if no_commit:
        logger.warn("Rolling back the transaction")
        models.db.session.rollback()
    else:
        models.db.session.commit()
def get_proposal_single_page(
        chamber,
        pk,
        cache_name=None,
        ):
    import yaml
    import os
    from mptracker.scraper.proposals import ProposalScraper

    SCRAPER_PACKAGE = path(__file__).abspath().parent
    PROJECT_ROOT = SCRAPER_PACKAGE.parent.parent

    session = create_session(cache_name=cache_name or
                             _get_config_cache_name())
    scraper = ProposalScraper(session)

    pk = int(pk)
    chamber = int(chamber)
    logger.info("scraping %d %d", chamber, pk)
    result = scraper.scrape_proposal_page(chamber, pk)

    keylist = ["date", "html", "location"]
    filename = (PROJECT_ROOT / "proposals" /
                "proposal_{pk_p}_{chamber_p}.yml".format(pk_p=pk,
                                                         chamber_p=chamber))

    # serialize the activity items, which are model objects, as plain dicts
    if 'activity' in result:
        activity_list = result.pop('activity')
        result['activity'] = [k.as_dict(keylist) for k in activity_list]

    if not os.path.exists(PROJECT_ROOT / "proposals"):
        os.makedirs(PROJECT_ROOT / "proposals")
    with open(filename, "w") as outfile:
        outfile.write(yaml.dump(result, default_flow_style=True))
def get_proposal_pages(
        throttle=None,
        cache_name=None,
        year=None,
        ):
    from itertools import chain
    from mptracker.scraper.proposals import ProposalScraper

    session = create_session(
        cache_name=cache_name or _get_config_cache_name(),
        throttle=float(throttle) if throttle else None,
    )
    scraper = ProposalScraper(session)
    # fetch the proposal listings from both chambers
    for record in chain(scraper.list_proposals(2, year),
                        scraper.list_proposals(1, year)):
        get_proposal_single_page(record['chamber'], record['pk'], cache_name)
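# Example invocation (argument values are illustrative): scrape the 2013
# proposal listings from both chambers, with request throttling and a
# shared HTTP cache, writing one YAML file per proposal:
#
#     get_proposal_pages(throttle='0.5', cache_name='page-cache', year=2013)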
def questions(year='2013'):
    from mptracker.scraper.questions import QuestionScraper

    known_urls = set(q.url for q in models.Question.query)

    def skip_question(url):
        return url in known_urls

    questions_scraper = QuestionScraper(session=create_session(throttle=0.5),
                                        skip=skip_question)
    mandate_lookup = models.MandateLookup()

    question_patcher = TablePatcher(models.Question,
                                    models.db.session,
                                    key_columns=['number', 'date'])

    with question_patcher.process() as add:
        for question in questions_scraper.run(int(year)):
            name, person_year, person_number = question.pop('person')
            mandate = mandate_lookup.find(name, person_year, person_number)
            question['mandate_id'] = mandate.id
            question['addressee'] = '; '.join(question['addressee'])
            add(question)
def committees(
        cache_name=None,
        throttle=None,
        no_commit=False,
        ):
    from mptracker.scraper.committees import CommitteeScraper

    mandate_lookup = models.MandateLookup()

    http_session = create_session(
        cache_name=cache_name,
        throttle=throttle and float(throttle),
    )
    scraper = CommitteeScraper(http_session)

    committee_patcher = TablePatcher(
        models.MpCommittee,
        models.db.session,
        key_columns=['chamber_id', 'cdep_id'],
    )
    membership_patcher = TablePatcher(
        models.MpCommitteeMembership,
        models.db.session,
        key_columns=['mandate_id', 'mp_committee_id', 'interval'],
    )

    with committee_patcher.process(remove=True) as add_committee, \
         membership_patcher.process(remove=True) as add_membership:
        for committee in scraper.fetch_committees():
            res = add_committee(
                committee.as_dict(['chamber_id', 'cdep_id', 'name']),
            )
            if res.is_new:
                models.db.session.flush()
            mp_committee = res.row

            for member in committee.current_members + committee.former_members:
                if member.end_date and member.end_date < TERM_2012_START:
                    logger.warn(
                        "Membership end date is before the 2012 "
                        "term started, skipping: %r %r %r",
                        member.mp_name, committee.name, member.end_date,
                    )
                    continue

                interval = DateRange(
                    member.start_date or TERM_2012_START,
                    member.end_date or date.max,
                )
                if interval.lower > interval.upper:
                    raise RuntimeError("Membership interval is reversed: %r"
                                       % (interval,))

                mandate = mandate_lookup.find(
                    member.mp_name,
                    member.mp_ident.year,
                    member.mp_ident.number,
                )

                add_membership({
                    'role': member.role,
                    'interval': interval,
                    'mandate_id': mandate.id,
                    'mp_committee_id': mp_committee.id,
                })

    if no_commit:
        logger.warn("Rolling back the transaction")
        models.db.session.rollback()
    else:
        models.db.session.commit()
def votes(
        start=None,
        days=1,
        cache_name=None,
        throttle=None,
        no_commit=False,
        autoanalyze=False,
        ):
    from mptracker.scraper.votes import VoteScraper

    if start is None:
        start = models.db.session.execute(
            'select date from voting_session '
            'order by date desc limit 1').scalar() + ONE_DAY
    else:
        start = parse_date(start)

    days = int(days)

    http_session = create_session(cache_name=cache_name,
                                  throttle=throttle and float(throttle))
    vote_scraper = VoteScraper(http_session)

    voting_session_patcher = TablePatcher(
        models.VotingSession,
        models.db.session,
        key_columns=['cdeppk'],
    )
    vote_patcher = TablePatcher(
        models.Vote,
        models.db.session,
        key_columns=['voting_session_id', 'mandate_id'],
    )

    proposal_ids = {p.cdeppk_cdep: p.id for p in models.Proposal.query}
    mandate_lookup = models.MandateLookup()
    new_voting_session_list = []

    with voting_session_patcher.process() as add_voting_session:
        with vote_patcher.process() as add_vote:
            for delta in range(days):
                the_date = start + ONE_DAY * delta
                if the_date >= date.today():
                    # don't scrape today, maybe voting is not done yet!
                    break
                logger.info("Scraping votes from %s", the_date)
                for voting_session in vote_scraper.scrape_day(the_date):
                    record = model_to_dict(
                        voting_session,
                        ['cdeppk', 'subject', 'subject_html'],
                    )
                    record['date'] = the_date
                    proposal_cdeppk = voting_session.proposal_cdeppk
                    record['proposal_id'] = (proposal_ids.get(proposal_cdeppk)
                                             if proposal_cdeppk else None)
                    record['final'] = "vot final" in record['subject'].lower()
                    vs = add_voting_session(record).row
                    if vs.id is None:
                        models.db.session.flush()
                    new_voting_session_list.append(vs.id)

                    for vote in voting_session.votes:
                        record = model_to_dict(vote, ['choice'])
                        record['voting_session_id'] = vs.id
                        mandate = mandate_lookup.find(
                            vote.mandate_name,
                            vote.mandate_year,
                            vote.mandate_number,
                        )
                        record['mandate_id'] = mandate.id
                        add_vote(record)

    if no_commit:
        logger.warn("Rolling back the transaction")
        models.db.session.rollback()
    else:
        models.db.session.commit()

    if autoanalyze:
        from mptracker.votes import calculate_voting_session_loyalty
        logger.info("Scheduling %d jobs", len(new_voting_session_list))
        for voting_session_id in new_voting_session_list:
            calculate_voting_session_loyalty.delay(voting_session_id)
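# Example (date value is illustrative): scrape one week of voting sessions
# starting 2013-09-02 and queue loyalty calculations for the scraped
# sessions:
#
#     votes(start='2013-09-02', days='7', throttle='0.5', autoanalyze=True)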
def transcripts(start=None, n_sessions=1, cache_name=None, throttle=None):
    from mptracker.scraper.transcripts import TranscriptScraper

    if start is None:
        max_serial = models.db.session.execute(
            'select serial from transcript_chapter '
            'order by serial desc limit 1').scalar()
        start = int(max_serial.split('/')[0]) + 1

    cdeppk = int(start) - 1
    n_sessions = int(n_sessions)

    transcript_scraper = TranscriptScraper(
        session=create_session(cache_name=cache_name,
                               throttle=throttle and float(throttle)))

    mandate_lookup = models.MandateLookup()

    transcript_patcher = TablePatcher(models.Transcript,
                                      models.db.session,
                                      key_columns=['serial'])

    with transcript_patcher.process() as add:
        while n_sessions > 0:
            n_sessions -= 1
            cdeppk += 1
            logger.info("Fetching session %s", cdeppk)
            session_data = transcript_scraper.fetch_session(cdeppk)
            if session_data is None:
                logger.info("No content")
                continue
            for chapter in session_data.chapters:
                chapter_row = (models.TranscriptChapter.query
                               .filter_by(serial=chapter.serial)
                               .first())
                if chapter_row is None:
                    chapter_row = models.TranscriptChapter(
                        serial=chapter.serial)
                    models.db.session.add(chapter_row)
                    models.db.session.flush()

                chapter_row.date = session_data.date
                chapter_row.headline = chapter.headline

                for paragraph in chapter.paragraphs:
                    if paragraph['mandate_chamber'] != 2:
                        continue
                    try:
                        mandate = mandate_lookup.find(
                            paragraph['speaker_name'],
                            paragraph['mandate_year'],
                            paragraph['mandate_number'])
                    except models.LookupError as e:
                        logger.warn("at %s %s", paragraph['serial'], e)
                        continue

                    transcript_data = {
                        'chapter_id': chapter_row.id,
                        'text': paragraph['text'],
                        'serial': paragraph['serial'],
                        'mandate_id': mandate.id,
                    }
                    add(transcript_data)

    models.db.session.commit()
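# Example: resume from the most recent transcript chapter in the database
# and fetch the next three plenary sessions (cache name is illustrative):
#
#     transcripts(n_sessions='3', cache_name='page-cache', throttle='0.5')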
def proposals(
        cache_name=None,
        throttle=None,
        autoanalyze=False,
        ):
    from mptracker.scraper.proposals import ProposalScraper
    from mptracker.proposals import ocr_proposal
    from mptracker.policy import calculate_proposal

    proposal_scraper = ProposalScraper(create_session(
        cache_name=cache_name,
        throttle=float(throttle) if throttle else None))

    def cdep_id(mandate):
        return (mandate.year, mandate.cdep_number)

    by_cdep_id = {cdep_id(m): m
                  for m in models.Mandate.query
                  if m.year == 2012}

    id_cdeppk_cdep = {}
    id_cdeppk_senate = {}
    for proposal in models.Proposal.query:
        if proposal.cdeppk_cdep:
            id_cdeppk_cdep[proposal.cdeppk_cdep] = proposal.id
        if proposal.cdeppk_senate:
            id_cdeppk_senate[proposal.cdeppk_senate] = proposal.id

    chamber_by_slug = {c.slug: c for c in models.Chamber.query}

    proposals = proposal_scraper.fetch_from_mp_pages(set(by_cdep_id.keys()))

    all_activity = defaultdict(list)
    for item in models.ProposalActivityItem.query:
        all_activity[item.proposal_id].append(item)

    proposal_patcher = TablePatcher(models.Proposal,
                                    models.db.session,
                                    key_columns=['id'])

    activity_patcher = TablePatcher(models.ProposalActivityItem,
                                    models.db.session,
                                    key_columns=['id'])

    sp_updates = sp_added = sp_removed = 0

    changed = []
    seen = []

    with proposal_patcher.process(autoflush=1000, remove=True) as add_proposal:
        with activity_patcher.process(autoflush=1000, remove=True) \
                as add_activity:
            for prop in proposals:
                record = model_to_dict(prop, ['cdeppk_cdep', 'cdeppk_senate',
                                              'decision_chamber', 'url',
                                              'title', 'date', 'number_bpi',
                                              'number_cdep', 'number_senate',
                                              'proposal_type', 'pdf_url'])
                slug = prop.decision_chamber
                if slug:
                    record['decision_chamber'] = chamber_by_slug[slug]

                idc = id_cdeppk_cdep.get(prop.cdeppk_cdep)
                ids = id_cdeppk_senate.get(prop.cdeppk_senate)
                if idc and ids and idc != ids:
                    logger.warn("Two different records for the same "
                                "proposal: (%s, %s). Removing the second.",
                                idc, ids)
                    models.db.session.delete(models.Proposal.query.get(ids))
                    ids = None
                record['id'] = idc or ids or models.random_uuid()

                result = add_proposal(record)
                row = result.row
                if result.is_changed:
                    changed.append(row)
                seen.append(row)

                new_people = set(by_cdep_id[ci] for ci in prop.sponsorships)
                existing_sponsorships = {sp.mandate: sp
                                         for sp in row.sponsorships}
                to_remove = set(existing_sponsorships) - set(new_people)
                to_add = set(new_people) - set(existing_sponsorships)
                if to_remove:
                    logger.info("Removing sponsors %s: %r", row.id,
                                [cdep_id(m) for m in to_remove])
                    sp_removed += 1
                    for m in to_remove:
                        sp = existing_sponsorships[m]
                        models.db.session.delete(sp)
                if to_add:
                    logger.info("Adding sponsors %s: %r", row.id,
                                [cdep_id(m) for m in to_add])
                    sp_added += 1
                    for m in to_add:
                        row.sponsorships.append(models.Sponsorship(mandate=m))
                if to_remove or to_add:
                    sp_updates += 1

                db_activity = all_activity[row.id]
                db_activity.sort(key=lambda a: a.order)
                act_fields = lambda r: (r.date, r.location)
                if ([act_fields(r) for r in db_activity] !=
                    [act_fields(r) for r in
                     prop.activity[:len(db_activity)]]):
                    logger.warn("History doesn't match for %s, "
                                "%d items will be removed",
                                row.id, len(db_activity))
                    db_activity = []

                for n, ac in enumerate(prop.activity):
                    record = model_to_dict(ac, ['date', 'location', 'html'])
                    record['proposal_id'] = row.id
                    record['order'] = n
                    if n < len(db_activity):
                        item = db_activity[n]
                        record['id'] = item.id
                        assert item.date == record['date']
                        assert item.location == record['location']
                        assert item.order == record['order']
                    else:
                        record['id'] = models.random_uuid()
                    add_activity(record)

    models.db.session.commit()

    logger.info("Updated sponsorship for %d proposals (+%d, -%d)",
                sp_updates, sp_added, sp_removed)

    if autoanalyze:
        logger.info("Scheduling analysis jobs for %d proposals",
                    len(changed))
        for proposal in changed:
            if proposal.pdf_url:
                ocr_proposal.delay(proposal.id, autoanalyze=True)

        logger.info("Scheduling policy jobs for %d proposals", len(seen))
        for proposal in seen:
            if proposal.policy_domain_id is None:
                calculate_proposal.delay(proposal.id)
def groups(
        cache_name=None,
        throttle=None,
        no_commit=False,
        ):
    from mptracker.scraper.groups import GroupScraper, Interval

    http_session = create_session(cache_name=cache_name,
                                  throttle=throttle and float(throttle))
    group_scraper = GroupScraper(http_session)

    mandate_lookup = models.MandateLookup()
    mandate_intervals = defaultdict(list)

    groups = list(group_scraper.fetch())
    independents = groups[0]
    assert independents.is_independent
    for group in groups[1:] + [independents]:
        for member in group.current_members + group.former_members:
            (year, chamber, number) = member.mp_ident
            assert chamber == 2
            mandate = mandate_lookup.find(member.mp_name, year, number)
            interval_list = mandate_intervals[mandate]

            interval = member.get_interval()
            if interval.start is None:
                interval = interval._replace(start=TERM_2012_START)

            if group.is_independent:
                if interval_list:
                    start = interval_list[-1].end
                    interval = interval._replace(start=start)

            interval_list.append(interval)
            interval_list.sort(key=lambda i: i[0])

    for mandate, interval_list in mandate_intervals.items():
        # make sure interval_list is continuous
        new_intervals = []
        for interval_one, interval_two in \
                zip(interval_list[:-1], interval_list[1:]):
            assert interval_one.start < interval_one.end
            if interval_one.end < interval_two.start:
                interval = Interval(
                    start=interval_one.end,
                    end=interval_two.start,
                    group=independents,
                )
                new_intervals.append(interval)
            elif interval_one.end > interval_two.start:
                raise RuntimeError("Overlapping intervals")
        interval_list.extend(new_intervals)
        interval_list.sort()

        mandate_end = mandate.interval.upper
        if mandate_end == date.max:
            mandate_end = None
        if interval_list[-1].end != mandate_end:
            logger.warn("Mandate %s ends at %s",
                        mandate, interval_list[-1].end)

    group_patcher = TablePatcher(
        models.MpGroup,
        models.db.session,
        key_columns=['short_name'],
    )
    with group_patcher.process(remove=True) as add_group:
        for group in groups:
            record = group.as_dict(['name', 'short_name'])
            group.row = add_group(record).row

    models.db.session.flush()

    membership_patcher = TablePatcher(
        models.MpGroupMembership,
        models.db.session,
        key_columns=['mandate_id', 'mp_group_id', 'interval'],
    )
    with membership_patcher.process(
            autoflush=1000,
            remove=True,
            ) as add_membership:
        for mandate, interval_list in mandate_intervals.items():
            for interval in interval_list:
                row = add_membership({
                    'mandate_id': mandate.id,
                    'mp_group_id': interval.group.row.id,
                    'interval': DateRange(
                        interval.start or date.min,
                        interval.end or date.max,
                    ),
                }).row

    if no_commit:
        logger.warn("Rolling back the transaction")
        models.db.session.rollback()
    else:
        models.db.session.commit()
def questions(
        year='2013',
        reimport_existing=False,
        cache_name=None,
        throttle=None,
        autoanalyze=False,
        ):
    from mptracker.scraper.questions import QuestionScraper
    from mptracker.questions import ocr_question
    from mptracker.policy import calculate_question

    if reimport_existing:
        known_urls = set()
    else:
        known_urls = set(q.url for q in models.Question.query)

    def skip_question(url):
        return url in known_urls

    http_session = create_session(cache_name=cache_name,
                                  throttle=throttle and float(throttle),
                                  counters=True)
    questions_scraper = QuestionScraper(session=http_session,
                                        skip=skip_question)

    mandate_lookup = models.MandateLookup()

    question_patcher = TablePatcher(models.Question,
                                    models.db.session,
                                    key_columns=['number', 'date'])

    new_ask_rows = 0

    changed = []
    with question_patcher.process() as add:
        for question in questions_scraper.run(int(year)):
            person_list = question.pop('person')
            question['addressee'] = '; '.join(question['addressee'])
            result = add(question)
            q = result.row

            old_asked = {ask.mandate_id: ask for ask in q.asked}

            for name, person_year, person_number in person_list:
                mandate = mandate_lookup.find(name, person_year,
                                              person_number)
                if mandate.id in old_asked:
                    old_asked.pop(mandate.id)
                else:
                    ask = models.Ask(mandate=mandate)
                    q.asked.append(ask)
                    ask.set_meta('new', True)
                    logger.info("Adding ask for %s: %s", q, mandate)
                    new_ask_rows += 1

            if result.is_changed:
                changed.append(q)

            assert not old_asked

    models.db.session.commit()

    if new_ask_rows:
        logger.info("Added %d ask records", new_ask_rows)

    counters = http_session.counters
    logger.info("HTTP: %d kb in %s requests, %.2f seconds",
                counters['bytes'] / 1024,
                counters['requests'],
                counters['download_time'].total_seconds())

    if autoanalyze:
        logger.info("Scheduling jobs for %d questions", len(changed))
        for question in changed:
            if question.pdf_url:
                ocr_question.delay(question.id, autoanalyze=True)

            if question.policy_domain_id is None:
                calculate_question.delay(question.id)
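# Example: import the 2013 questions, skipping URLs already in the
# database, and queue OCR / policy-domain jobs for changed rows
# (cache name is illustrative):
#
#     questions(year='2013', cache_name='page-cache', autoanalyze=True)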
def people(
        year='2012',
        cache_name=None,
        throttle=None,
        no_commit=False,
        ):
    from mptracker.scraper.people import MandateScraper

    http_session = create_session(
        cache_name=cache_name,
        throttle=throttle and float(throttle),
    )

    mandate_scraper = MandateScraper(http_session)

    mandate_patcher = TablePatcher(
        models.Mandate,
        models.db.session,
        key_columns=['year', 'cdep_number'],
    )
    with mandate_patcher.process() as add_mandate:
        for mandate in mandate_scraper.fetch(year):
            row = mandate.as_dict([
                'year',
                'cdep_number',
                'minority',
                'college',
                'constituency',
            ])
            if year == '2012':
                end_date = mandate.end_date or date.max
                row['interval'] = DateRange(TERM_2012_START, end_date)

            person = (
                models.Person.query
                .filter_by(name=mandate.person_name)
                .first())
            if person is None:
                raise RuntimeError("Can't find person named %r"
                                   % mandate.person_name)
            row['person_id'] = person.id

            if not mandate.minority:
                county = (
                    models.County.query
                    .filter_by(name=mandate.county_name)
                    .first())
                if county is None:
                    raise RuntimeError("Can't match county name %r"
                                       % mandate.county_name)
                row['county'] = county

            add_mandate(row)

    if no_commit:
        logger.warn("Rolling back the transaction")
        models.db.session.rollback()
    else:
        models.db.session.commit()
def proposals(dry_run=False):
    from mptracker.scraper.proposals import ProposalScraper

    proposal_scraper = ProposalScraper(create_session(cache_name='page-cache',
                                                      throttle=0.5))

    def cdep_id(mandate):
        return (mandate.year, mandate.cdep_number)

    by_cdep_id = {cdep_id(m): m
                  for m in models.Mandate.query
                  if m.year == 2012}

    chamber_by_slug = {c.slug: c for c in models.Chamber.query}

    proposals = proposal_scraper.fetch_from_mp_pages(set(by_cdep_id.keys()))

    proposal_patcher = TablePatcher(models.Proposal,
                                    models.db.session,
                                    key_columns=['combined_id'])

    sp_updates = sp_added = sp_removed = 0

    with proposal_patcher.process(autoflush=1000, remove=True) as add:
        for record in proposals:
            if 'decision_chamber' in record:
                slug = record.pop('decision_chamber')
                record['decision_chamber'] = chamber_by_slug[slug]

            sponsorships = record.pop('_sponsorships')

            result = add(record)
            row = result.row

            new_people = set(by_cdep_id[ci] for ci in sponsorships)
            existing_sponsorships = {sp.mandate: sp for sp in row.sponsorships}
            to_remove = set(existing_sponsorships) - set(new_people)
            to_add = set(new_people) - set(existing_sponsorships)
            if to_remove:
                logger.info("Removing sponsors %s: %r", row.combined_id,
                            [cdep_id(m) for m in to_remove])
                sp_removed += 1
                for m in to_remove:
                    sp = existing_sponsorships[m]
                    models.db.session.delete(sp)
            if to_add:
                logger.info("Adding sponsors %s: %r", row.combined_id,
                            [cdep_id(m) for m in to_add])
                sp_added += 1
                for m in to_add:
                    row.sponsorships.append(models.Sponsorship(mandate=m))
            if to_remove or to_add:
                sp_updates += 1

    if dry_run:
        models.db.session.rollback()

    logger.info("Updated sponsorship for %d proposals (+%d, -%d)",
                sp_updates, sp_added, sp_removed)
def people(
        year='2012',
        cache_name=None,
        throttle=None,
        no_commit=False,
        add_people=False,
        ):
    from mptracker.scraper.people import MandateScraper

    http_session = create_session(
        cache_name=cache_name,
        throttle=throttle and float(throttle),
    )

    mandate_scraper = MandateScraper(http_session)

    mandate_patcher = TablePatcher(
        models.Mandate,
        models.db.session,
        key_columns=['year', 'cdep_number'],
    )
    person_patcher = TablePatcher(
        models.Person,
        models.db.session,
        key_columns=['id'],
    )

    new_people = 0
    chamber_by_slug = {c.slug: c for c in models.Chamber.query}

    with mandate_patcher.process() as add_mandate, \
         person_patcher.process() as add_person:
        for mandate in mandate_scraper.fetch(year):
            row = mandate.as_dict([
                'year',
                'cdep_number',
                'minority',
                'college',
                'constituency',
                'picture_url',
            ])
            assert mandate.chamber_number == 2
            row['chamber_id'] = chamber_by_slug['cdep'].id

            if year == '2012':
                end_date = mandate.end_date or date.max
                row['interval'] = DateRange(TERM_2012_START, end_date)

            person = (
                models.Person.query
                .filter_by(name=mandate.person_name)
                .first())
            if person is None:
                if add_people:
                    person = models.Person(name=mandate.person_name)
                    models.db.session.add(person)
                    models.db.session.flush()
                    new_people += 1
                else:
                    raise RuntimeError("Can't find person named %r"
                                       % mandate.person_name)

            assert not add_person({
                'id': person.id,
                'first_name': mandate.person_first_name,
                'last_name': mandate.person_last_name,
            }).is_new
            row['person_id'] = person.id

            if not mandate.minority:
                county = (
                    models.County.query
                    .filter_by(name=mandate.county_name)
                    .first())
                if county is None:
                    raise RuntimeError("Can't match county name %r"
                                       % mandate.county_name)
                row['county'] = county

            add_mandate(row)

    if new_people:
        logger.info("%d new people", new_people)

    if no_commit:
        logger.warn("Rolling back the transaction")
        models.db.session.rollback()
    else:
        models.db.session.commit()
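# Example: import the 2012 mandates, creating Person rows for names that
# are not in the database yet (cache name is illustrative):
#
#     people(year='2012', cache_name='page-cache', add_people=True)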
def votes(
        start=None,
        n_days=1,
        cache_name=None,
        throttle=None,
        ):
    from mptracker.scraper.votes import VoteScraper

    if start is None:
        start = models.db.session.execute(
            'select date from voting_session '
            'order by date desc limit 1').scalar() + ONE_DAY
    else:
        start = parse_date(start)

    n_days = int(n_days)

    http_session = create_session(cache_name=cache_name,
                                  throttle=throttle and float(throttle))
    vote_scraper = VoteScraper(http_session)

    voting_session_patcher = TablePatcher(
        models.VotingSession,
        models.db.session,
        key_columns=['cdeppk'],
    )
    vote_patcher = TablePatcher(
        models.Vote,
        models.db.session,
        key_columns=['voting_session_id', 'mandate_id'],
    )

    proposal_ids = {p.cdeppk_cdep: p.id for p in models.Proposal.query}
    mandate_lookup = models.MandateLookup()

    with voting_session_patcher.process() as add_voting_session:
        with vote_patcher.process() as add_vote:
            for delta in range(n_days):
                the_date = start + ONE_DAY * delta
                if the_date >= date.today():
                    # don't scrape today, maybe voting is not done yet!
                    break
                logger.info("Scraping votes from %s", the_date)
                for voting_session in vote_scraper.scrape_day(the_date):
                    record = model_to_dict(
                        voting_session,
                        ['cdeppk', 'subject', 'subject_html'],
                    )
                    record['date'] = the_date
                    proposal_cdeppk = voting_session.proposal_cdeppk
                    record['proposal_id'] = (proposal_ids.get(proposal_cdeppk)
                                             if proposal_cdeppk else None)
                    vs = add_voting_session(record).row
                    if vs.id is None:
                        models.db.session.flush()

                    for vote in voting_session.votes:
                        record = model_to_dict(vote, ['choice'])
                        record['voting_session_id'] = vs.id
                        mandate = mandate_lookup.find(
                            vote.mandate_name,
                            vote.mandate_year,
                            vote.mandate_number,
                        )
                        record['mandate_id'] = mandate.id
                        add_vote(record)

    models.db.session.commit()
def groups(
        cache_name=None,
        throttle=None,
        no_commit=False,
        year='2012',
        ):
    from mptracker.scraper.groups import GroupScraper, Interval

    year = int(year)

    http_session = create_session(cache_name=cache_name,
                                  throttle=throttle and float(throttle))
    group_scraper = GroupScraper(http_session)

    mandate_lookup = models.MandateLookup()
    mandate_intervals = defaultdict(list)

    term_interval = TERM_INTERVAL[year]

    groups = list(group_scraper.fetch(year))
    independents = groups[0]
    assert independents.is_independent
    for group in groups[1:] + [independents]:
        for member in group.current_members + group.former_members:
            (myear, chamber, number) = member.mp_ident
            assert chamber == 2
            mandate = mandate_lookup.find(member.mp_name, myear, number)
            interval_list = mandate_intervals[mandate]

            interval = member.get_interval()
            if interval.start is None:
                interval = interval._replace(start=term_interval.lower)
            if interval.end is None:
                interval = interval._replace(end=term_interval.upper)

            if group.is_independent:
                if interval_list:
                    start = interval_list[-1].end
                    interval = interval._replace(start=start)

            interval_list.append(interval)
            interval_list.sort(key=lambda i: i[0])

    for mandate, interval_list in mandate_intervals.items():
        # make sure interval_list is continuous
        new_intervals = []
        for interval_one, interval_two in \
                zip(interval_list[:-1], interval_list[1:]):
            assert interval_one.start < interval_one.end
            if interval_one.end < interval_two.start:
                interval = Interval(
                    start=interval_one.end,
                    end=interval_two.start,
                    group=independents,
                )
                new_intervals.append(interval)
            elif interval_one.end > interval_two.start:
                raise RuntimeError("Overlapping intervals")
        interval_list.extend(new_intervals)
        interval_list.sort()

        mandate_end = mandate.interval.upper
        if mandate_end == date.max:
            mandate_end = None
        if interval_list[-1].end != mandate_end:
            logger.warn("Mandate %s ends at %s",
                        mandate, interval_list[-1].end)

    group_patcher = TablePatcher(
        models.MpGroup,
        models.db.session,
        key_columns=['short_name', 'year'],
    )
    with group_patcher.process(remove=True,
                               filter={'year': year}) as add_group:
        for group in groups:
            record = group.as_dict(['name', 'short_name', 'year'])
            group.row = add_group(record).row

    models.db.session.flush()

    membership_patcher = TablePatcher(
        models.MpGroupMembership,
        models.db.session,
        key_columns=['mandate_id', 'mp_group_id', 'interval'],
    )
    current_membership_query = (
        models.db.session.query(models.MpGroupMembership.id)
        .join(models.MpGroupMembership.mandate)
        .filter_by(year=year)
    )
    remove_membership_ids = set(row.id for row in current_membership_query)

    with membership_patcher.process(autoflush=1000) as add_membership:
        for mandate, interval_list in mandate_intervals.items():
            for interval in interval_list:
                res = add_membership({
                    'mandate_id': mandate.id,
                    'mp_group_id': interval.group.row.id,
                    'interval': DateRange(
                        interval.start or date.min,
                        interval.end or date.max,
                    ),
                })
                if not res.is_new:
                    remove_membership_ids.remove(res.row.id)

    if remove_membership_ids:
        unseen_items = (
            models.MpGroupMembership.query
            .filter(models.MpGroupMembership.id.in_(remove_membership_ids))
        )
        unseen_items.delete(synchronize_session=False)
        logger.info("Deleted %d stale memberships",
                    len(remove_membership_ids))

    if no_commit:
        logger.warn("Rolling back the transaction")
        models.db.session.rollback()
    else:
        models.db.session.commit()
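# Example dry run (values are illustrative): scrape the 2012 parliamentary
# groups and memberships, then roll back instead of committing:
#
#     groups(cache_name='page-cache', throttle='0.5', no_commit=True)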