def __init__(self):
    # log in and start a session with the PMG website
    logger.info("LOGGING IN")
    self.session = requests.Session()
    headers = {'user-agent': 'Mozilla/4.0 (compatible: MSIE 6.0)'}
    try:
        data = {
            'name': config['name'],
            'pass': config['pass'],
            'form_id': 'user_login',
            'form_build_id': 'form-ee72095493d7ed912673b8a83219772c',
            'op': 'Log in'
        }
        r = self.session.post('http://www.pmg.org.za/user/login', headers=headers, data=data)
        if "Welcome back." not in r.content:
            logger.error("Login was not successful")
            raise Exception("Login was not successful")
    except Exception:
        logger.exception("Configuration error:")
        logger.error(
            "Please ensure that a file called 'scraper_config.json' exists in the scraper "
            "directory, and that it contains valid 'name' and 'pass' parameters for logging "
            "in to the PMG website. This is needed for accessing much of the content.")
        raise
    self.stats = {}
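# A minimal sketch of the expected scraper_config.json, assuming it is loaded
# with json.load() into the `config` dict used above. The keys must match the
# login form fields ('name' and 'pass'); the values below are placeholders.
#
#     {
#         "name": "your-pmg-username",
#         "pass": "your-pmg-password"
#     }
import json

with open('scraper_config.json') as f:
    config = json.load(f)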
def run_scraper(self):
    committees = Agent.query.filter(Agent.type == "committee").filter(Agent.url != None).all()
    shuffle(committees)  # randomize the order, just to keep things interesting
    for i, committee in enumerate(committees):
        self.current_committee = committee
        self.current_url = committee.url
        try:
            self.current_page = scrapertools.URLFetcher(self.current_url, self.session).html
            logger.debug("Committee: " + str(committee.name))
            self.scrape_committee()
            # give some progress feedback
            logger.info(str(i + 1) + " out of " + str(len(committees)) + " committees' reports have been scraped.")
            logger.info(json.dumps(self.stats, indent=4))
            # commit entries to the database, once per committee
            logger.debug("SAVING TO DATABASE")
            db.session.commit()
        except Exception as e:
            msg = "Error scraping committee's reports."
            self.stats["errors"].append(msg)
            logger.error(msg)
            logger.exception(str(e))
    return
def run_scraper(self):
    """
    Iterate through bill pages, and run the state machine for each page.
    """
    pager = Pager()
    # iterate through bill pages
    for url in pager.next_page:
        logger.info(url)
        # initialise the parser for this page
        self.state_fn = self.start_state
        html = scrapertools.URLFetcher(url, self.session).html
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
        table = soup.find("tbody")
        rows = table.findAll("tr")
        # feed rows into the state machine
        for row in rows:
            while not self.state_fn(row):
                pass
        # commit to the database after each page (currently disabled)
        # db.session.commit()
    return
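# A minimal sketch of the state-function protocol assumed by the
# `while not self.state_fn(row)` loop above: a state function consumes a row
# and returns True, or points self.state_fn at another state and returns False
# so the same row is dispatched again. The body below is illustrative only,
# not the scraper's actual start_state.
def start_state(self, row):
    if row.find("th"):
        return True  # header row consumed; stay in this state
    self.state_fn = self.header_state  # header_state appears in the bill scraper below
    return False  # re-dispatch this row to header_state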
def scrape_bills(self):
    logger.info("\n ----------- SCRAPING BILLS ---------------")
    bill_scraper = bills.BillScraper(self.session)
    bill_scraper.run_scraper()
    logger.info(json.dumps(bill_scraper.stats, indent=4))
    return

def scrape_committees(self):
    logger.info("\n ----------- SCRAPING COMMITTEES ---------------")
    committee_scraper = committees.CommitteeScraper(self.session)
    committee_scraper.run_scraper()
    logger.info(json.dumps(committee_scraper.stats, indent=4))
    return

def scrape_hansards(self):
    logger.info("\n ----------- SCRAPING HANSARDS ---------------")
    hansard_scraper = hansards.HansardScraper(self.session)
    hansard_scraper.run_scraper()
    logger.info(json.dumps(hansard_scraper.stats, indent=4))
    return

def scrape_committee_reports(self):
    logger.info("\n ----------- SCRAPING COMMITTEE REPORTS ---------------")
    report_scraper = committee_reports.ReportScraper(self.session)
    report_scraper.run_scraper()
    logger.info(json.dumps(report_scraper.stats, indent=4))
    return
def find_enacted_bills():
    """
    Set the status of bills that have already been enacted.
    """
    for bill in Bill.query.all():
        for entry in bill.entries:
            if entry.type == "act":
                bill.status = "enacted"
                db.session.add(bill)
                logger.info("enacted: " + bill.name)
                break
    db.session.commit()
    return
def find_current_bills():
    """
    Update the status of the most recent set of bills from
    http://pmg.org.za/billsstatus/proceedings, via the csv at /data/current_status.csv.
    """
    with open("../data/current_status.csv", 'Ur') as f:
        reader = csv.reader(f)
        headers = next(reader)  # skip the column-title row
        for entry in reader:
            # fix bill types
            if entry[0].startswith("PM"):
                entry[0] = "PMB" + entry[0][2:]
            elif not entry[0].startswith("B"):
                entry[0] = "B" + entry[0]
            tmp_code = entry[0]
            tmp_status = entry[1].lower()
            # clean the bill code
            tmp = analyze_bill_code(tmp_code)
            code = tmp["code"]
            logger.info(code + " " + str(entry))
            bill = Bill.query.filter(Bill.code == code).first()
            if bill is None:
                logger.error("Error finding bill " + code)
                continue
            # map status codes from the csv onto the values used in the database
            available_status = {
                "act": "enacted",
                "": None,
                "pc": "na",
                "sc": "ncop",
                "intro": "na",
            }
            if available_status.get(tmp_status):
                tmp_status = available_status[tmp_status]
            bill.status = tmp_status
            db.session.add(bill)
    db.session.commit()
    return
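# Illustrative layout for ../data/current_status.csv, assuming the two columns
# read above (bill code, then a status code from available_status). These rows
# are hypothetical examples, not real data:
#
#     code,status
#     B1-2013,act      ->  status becomes "enacted"
#     PM2-2013,sc      ->  code becomes PMB2-2013, status becomes "ncop"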
def run(self, rebuild_db=False, set_status=False):
    start_time = datetime.datetime.now()
    logger.info("Started at " + str(start_time))
    # start with a clean db if needed
    if rebuild_db:
        self.rebuild_db()
    # scrape content, and add it to the db
    self.scrape_bills()
    # self.scrape_hansards()
    self.scrape_committees()
    self.scrape_committee_reports()
    # update historic bill status data
    if set_status:
        bill_status.find_current_bills()
        bill_status.find_enacted_bills()
        bill_status.handle_assent()
    logger.info("Finished scraping at " + str(datetime.datetime.now()))
    logger.info("Duration: " + str(datetime.datetime.now() - start_time))
    return
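# A hedged sketch of how this run() method might be invoked; the class name
# Scraper is an assumption, since the enclosing class is not shown in these
# snippets:
if __name__ == "__main__":
    scraper = Scraper()
    scraper.run(rebuild_db=False, set_status=True)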
    agent.location = self.current_committee['location']
    self.stats["new_committees"] += 1
    agent.url = self.current_committee['url']
    db.session.add(agent)
    self.stats["total_committees"] += 1
    self.current_committee = {}
    return

@property
def next_committee(self):
    html = scrapertools.URLFetcher("http://www.pmg.org.za/committees", self.session).html
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    container = soup.find(id="committees-all")
    committee_lists = container.findAll("div", {"class": "item-list"})
    for committee_list in committee_lists:
        list_name = committee_list.find('h3').contents[0]
        logger.debug("\n" + list_name + ":")
        committees = committee_list.findAll('li')
        for committee in committees:
            href = "http://www.pmg.org.za" + committee.find('a').attrs[0][1]
            name = committee.find('a').contents[0]
            logger.debug("\t" + name)
            yield list_name, href, name


if __name__ == "__main__":
    committee_scraper = CommitteeScraper()
    committee_scraper.run_scraper()
    logger.info(json.dumps(committee_scraper.stats, indent=4))
            break
    return

def add_or_update(self):
    """
    Add current_hansard to the database, or update the record if it already exists.
    """
    self.current_hansard['entry_type'] = "hansard"
    bills = []
    if self.current_hansard.get('bills'):
        bills = self.current_hansard["bills"]
    # TODO: improve filtering
    hansard = Entry.query.filter(Entry.type == "hansard").filter(
        Entry.title == self.current_hansard['title']).first()
    if hansard is None:
        hansard = Entry()
        self.stats["new_hansards"] += 1
    hansard = scrapertools.populate_entry(hansard, self.current_hansard, bills)
    db.session.add(hansard)
    self.stats["total_hansards"] += 1
    return


if __name__ == "__main__":
    hansard_scraper = HansardScraper()
    hansard_scraper.run_scraper()
    logger.info(json.dumps(hansard_scraper.stats, indent=4))
def scrape_committee(self):
    """
    Scrape all meeting reports for a particular committee.
    """
    for (j, (date, title, href_report)) in enumerate(self.next_report):
        logger.debug("\t\t" + str(date) + " - " + (title[0:45] if len(title) > 45 else title))
        tmp_url = href_report
        html = scrapertools.URLFetcher(tmp_url, self.session).html
        soup = BeautifulSoup(html)
        content = soup.find(id="content")
        bills = scrapertools.find_bills(str(content))
        # only save report entries that can be tagged to bills
        if bills:
            self.current_report = {
                "entry_type": "committee-meeting",
                "bills": bills,
                "url": tmp_url,
                "date": date,
                "title": title,
                "agent": self.current_committee,
            }
            # the report URL may have changed after editing on pmg.org.za; check for this
            possible_duplicates = Entry.query.filter(Entry.agent == self.current_committee)\
                .filter(Entry.url != None)\
                .filter(Entry.url != tmp_url)\
                .filter(Entry.type == "committee-meeting")\
                .filter(Entry.is_deleted == False)\
                .filter(Entry.date == date)\
                .order_by(Entry.entry_id).all()
            deletion_flag = False
            if possible_duplicates:
                logger.debug(str(len(possible_duplicates)) + " possible duplicates found")
                for possible_duplicate in possible_duplicates:
                    redirect_url = scrapertools.URLFetcher(possible_duplicate.url, self.session).follow_redirect()
                    if possible_duplicate.url != redirect_url:
                        logger.debug('redirect encountered')
                        if redirect_url == tmp_url:
                            logger.info("Updating entry URL")
                            # update the existing record's URL
                            possible_duplicate.url = tmp_url
                            # # delete all but one entry, if there are multiple duplicates
                            # if deletion_flag:
                            #     logger.info('duplicate entry deleted')
                            #     possible_duplicate.is_deleted = True
                            db.session.add(possible_duplicate)
                            db.session.commit()
                            deletion_flag = True
            if self.current_committee.location:
                self.current_report["location"] = self.current_committee.location
            try:
                self.add_or_update()
            except Exception as e:
                msg = "Could not add committee report to database: "
                if self.current_report.get("title"):
                    msg += self.current_report["title"]
                self.stats["errors"].append(msg)
                logger.error(msg)
                logger.exception(str(e))
            self.current_report = {}
        else:
            logger.debug('no bills found in committee meeting report')
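# A minimal sketch of the follow_redirect() helper used above, assuming it
# resolves a possibly-moved PMG URL to its final address; this illustrates the
# assumed behaviour, not the actual scrapertools implementation.
import requests

def follow_redirect(url, session=None):
    # let requests follow any redirects, then report the final URL
    r = (session or requests).get(url, allow_redirects=True)
    return r.url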
def add_or_update(self):
    """
    Add current_report to the database, or update the record if it already exists.
    """
    report = Entry.query.filter_by(agent_id=self.current_committee.agent_id)\
        .filter_by(url=self.current_report['url'])\
        .filter_by(is_deleted=False).first()
    if report is None:
        report = Entry()
        self.stats["new_committee_reports"] += 1
    tmp_bills = None
    if self.current_report.get('bills'):
        tmp_bills = self.current_report['bills']
        logger.info(str(tmp_bills))
    report = scrapertools.populate_entry(report, self.current_report, tmp_bills)
    db.session.add(report)
    self.stats["total_committee_reports"] += 1
    self.current_report = {}
    return


if __name__ == "__main__":
    report_scraper = ReportScraper()
    report_scraper.run_scraper()
    logger.info(json.dumps(report_scraper.stats, indent=4))
            raise
        # set entry_type appropriately if this bill has already been enacted
        if "as enacted" in link.text:
            version['entry_type'] = "act"
        versions.append(version)
        self.state_fn = self.version_state
        return True
    else:
        self.state_fn = self.header_state
        return False


class Pager(object):
    """
    Return an iterable containing URLs for each of the available bill pages.
    """

    @property
    def next_page(self):
        current_year = datetime.today().year
        for year in range(current_year, 2005, -1):
            url = "http://www.pmg.org.za/print/bill?year=%d" % year
            yield url


if __name__ == "__main__":
    bill_scraper = BillScraper()
    bill_scraper.run_scraper()
    logger.info(json.dumps(bill_scraper.stats, indent=4))
def handle_assent():
    """
    Add entries relating to a bill's assent from
    http://pmg.org.za/billsstatus/proceedings, via the csv at /data/bill_assent_dates.csv.
    """
    with open("../data/bill_assent_dates.csv", 'Ur') as f:
        data = list(list(rec) for rec in csv.reader(f, delimiter=','))
    president = Agent.query.filter(Agent.name == "The President").first()
    for i in range(len(data)):
        # ignore the column-title row
        if i == 0:
            continue
        entry = data[i]
        # fix bill types
        if entry[0].startswith("PM"):
            entry[0] = "PMB" + entry[0][2:]
        elif not entry[0].startswith("B"):
            entry[0] = "B" + entry[0]
        tmp_code = entry[0]
        # clean the bill code
        tmp = analyze_bill_code(tmp_code)
        if tmp:
            code = tmp["code"]
        else:
            logger.error("Error analyzing bill code " + tmp_code)
            continue
        logger.info(code + " " + str(entry))
        bill = Bill.query.filter(Bill.code == code).first()
        if bill is None:
            logger.error("Error finding bill " + code)
            continue
        gazette = None  # ensure the name is defined even if no gazette is listed
        try:
            act_no = unicode(entry[1])
            assent_date = unicode(entry[2])
            # convert the date to a python date object
            try:
                assent_date = date_parser.parse(assent_date).date()
            except Exception:
                logger.error("Error parsing date " + entry[2])
                continue
            if entry[3] and len(entry[3]) > 2:
                gazette = unicode(entry[3])
        except UnicodeDecodeError:
            logger.error("Unicode error: " + str(entry))
            continue
        # update the bill record
        bill.status = "enacted"
        if gazette:
            bill.gazette = gazette
        db.session.add(bill)
        # add the relevant entry in the bill's history
        tmp_entry = Entry.query.join(Entry.bills).filter(Bill.code == code).filter(Entry.type == "assent").first()
        if not tmp_entry:
            tmp_entry = Entry()
            tmp_entry.bills.append(bill)
        tmp_entry.date = assent_date
        tmp_entry.type = "assent"
        tmp_entry.location = 3
        tmp_entry.title = "Signed into law by the President."
        tmp_entry.agent = president
        if act_no and gazette:
            tmp_entry.description = "Enacted as Act " + act_no + ". Refer to Government Gazette " + gazette + "."
        db.session.add(tmp_entry)
    db.session.commit()
    return
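# Illustrative layout for ../data/bill_assent_dates.csv, assuming the four
# columns read above (bill code, act number, assent date, gazette number).
# These rows are hypothetical examples, not real data:
#
#     code,act_no,assent_date,gazette
#     B1-2013,22,2013-07-18,36700
#     PM2-2012,5,2012-12-05,35900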