def run_scraper(self):
    committees = Agent.query.filter(Agent.type == "committee").filter(Agent.url != None).all()
    shuffle(committees)  # randomize the order, just to keep things interesting
    for i, committee in enumerate(committees):
        self.current_committee = committee
        self.current_url = committee.url
        try:
            self.current_page = scrapertools.URLFetcher(self.current_url, self.session).html
            logger.debug("Committee: " + str(committee.name))
            self.scrape_committee()
            # give some progress feedback
            logger.info(str(i + 1) + " out of " + str(len(committees)) + " committees' reports have been scraped.")
            logger.info(json.dumps(self.stats, indent=4))
            # commit entries to database, once per committee
            logger.debug("SAVING TO DATABASE")
            db.session.commit()
        except Exception as e:
            msg = "Error scraping committee's reports."
            self.stats["errors"].append(msg)
            logger.error(msg)
            logger.exception(str(e))
    return

def next_report(self):
    """
    Iterate over the reports listed on a particular page.
    """
    while True:
        soup = BeautifulSoup(self.current_page, convertEntities=BeautifulSoup.HTML_ENTITIES)
        reports_tab = soup.find(id="quicktabs_tabpage_committees_tabs_1")
        if reports_tab is None:
            logger.error("No reports tab for this committee: " + self.current_url)
            break
        table_body = reports_tab.find("tbody")
        if table_body:
            rows = table_body.findAll("tr")
            for row in rows:
                try:
                    cells = row.findAll('td')
                    date = date_parser.parse(cells[1].find('span').contents[0]).date()
                    title = cells[2].find('a').contents[0]
                    href = "http://www.pmg.org.za" + cells[2].find('a').attrs[0][1]
                    yield date, title, href
                except Exception:
                    msg = "Error reading committee report details from table row: "
                    msg += self.current_url
                    self.stats["errors"].append(msg)
        else:
            logger.error("No table body")
        if not self.next_page:
            break
    return

def add_or_update(self):
    """
    Add current_bill to database, or update the record if it already exists.
    Then clear the current_bill attribute to make it ready for the next bill
    to be scraped.
    """
    bill_data = self.current_bill
    try:
        if self.current_bill.get('status') == "Draft":
            # save scraped draft bill to database
            bill = Bill.query.filter(Bill.name == bill_data['bill_name'])\
                .filter(Bill.year == bill_data['year']).first()
            if bill is None:
                bill = Bill()
                bill.name = bill_data['bill_name']
                bill.year = bill_data['year']
                self.stats['new_drafts'] += 1
            bill.bill_type = "Draft"
            db.session.add(bill)
            self.stats['total_drafts'] += 1
        else:
            # save scraped bills to database
            bill_code = self.current_bill["code"]
            bill = Bill.query.filter(Bill.code == bill_code).first()
            if bill is None:
                bill = Bill()
                bill.code = bill_code
                self.stats['new_bills'] += 1
            bill.name = bill_data['bill_name']
            bill.year = bill_data['year']
            bill.number = bill_data['number']
            db.session.add(bill)
            self.stats['total_bills'] += 1
            # save related bill versions
            for entry_data in bill_data['versions']:
                # look for a pre-existing entry
                entry = Entry.query.filter(Entry.url == entry_data['url']).first()
                if entry is None:
                    # create a new entry
                    entry = Entry()
                    self.stats['new_bill_versions'] += 1
                entry = scrapertools.populate_entry(entry, entry_data)
                entry.bills.append(bill)
                db.session.add(entry)
                self.stats['total_bill_versions'] += 1
    except Exception:
        error_msg = "Error saving bill: "
        if self.current_bill.get('bill_name'):
            error_msg += self.current_bill['bill_name']
        if self.current_bill.get('versions'):
            error_msg += " - " + self.current_bill['versions'][0]['title']
        logger.error(error_msg)
        self.stats['errors'].append(error_msg)
    logger.debug(json.dumps(self.current_bill, indent=4, default=scrapertools.handler))
    self.current_bill = {}
    return

def run_scraper(self):
    # define agents
    na = Agent.query.filter(Agent.name == "National Assembly").first()
    ncop = Agent.query.filter(Agent.name == "National Council Of Provinces").first()
    while True:
        for (j, (date, title, href_hansard)) in enumerate(self.next_hansard):
            logger.debug("\t\t" + str(date) + " - " + title)
            tmp_url = href_hansard
            html = scrapertools.URLFetcher(tmp_url, self.session).html
            soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
            content = soup.find(id="content")
            # find bills that are mentioned in the text
            bills = scrapertools.find_bills(str(content))
            # only save hansards that are related to bills
            if bills:
                # infer location from title, where appropriate
                location = None
                agent = None
                if title.startswith("NA:"):
                    location = 1
                    agent = na
                elif title.startswith("NCOP:"):
                    location = 2
                    agent = ncop
                self.current_hansard = {
                    "bills": bills,
                    "url": tmp_url,
                    "date": date,
                    "title": title,
                    "location": location,
                }
                if agent:
                    self.current_hansard['agent'] = agent
                try:
                    self.add_or_update()
                except Exception:
                    msg = "Could not add hansard to database: "
                    if self.current_hansard.get("title"):
                        msg += self.current_hansard["title"]
                    self.stats["errors"].append(msg)
                    logger.error(msg)
                self.current_hansard = {}
        # save hansards to database, once per scraped page
        db.session.commit()
        logger.debug(json.dumps(self.stats, indent=4))
        # test whether we have reached the end
        if not self.next_page:
            break
    return

def version_state(self, fragment):
    """
    Extract available versions from second row.
    """
    link = fragment.find("a")
    # test whether the row contains a link to a bill version
    if link and not ("bills.pmg.org.za" in link["href"] or "Bill Tracker" in link.text):
        versions = self.current_bill.setdefault("versions", [])
        url = link["href"]
        if not self.current_bill.get("code"):
            tmp = link.text
            info = scrapertools.analyze_bill_code(tmp)
            if info:
                self.current_bill.update(info)
            else:
                logger.error("No bill found in string: " + tmp)
        try:
            version = {
                "url": url,
                "title": link.text,
                "date": date_parser.parse(fragment.findAll("td")[1].text).date(),
                "entry_type": "bill-version",
            }
        except Exception:
            logger.debug(str(fragment))
            raise
        # set entry_type appropriately if this bill has already been enacted
        if "as enacted" in link.text:
            version['entry_type'] = "act"
        versions.append(version)
        self.state_fn = self.version_state
        return True
    else:
        self.state_fn = self.header_state
        return False

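# A note on the state-machine pattern above (an assumption about the surrounding
# driver, which is not shown in this file): the parser appears to walk the bill
# table row by row, calling self.state_fn(fragment) on each row. version_state
# keeps consuming version rows by re-assigning itself and returning True; when a
# row is not a version link, it returns False and hands control back to
# header_state so the same row can be re-read as the start of a new bill.
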
def __init__(self):
    # login and start session with pmg website
    logger.info("LOGGING IN")
    self.session = requests.Session()
    headers = {'user-agent': 'Mozilla/4.0 (compatible: MSIE 6.0)'}
    try:
        data = {
            'name': config['name'],
            'pass': config['pass'],
            'form_id': 'user_login',
            'form_build_id': 'form-ee72095493d7ed912673b8a83219772c',
            'op': 'Log in'
        }
        r = self.session.post('http://www.pmg.org.za/user/login', headers=headers, data=data)
        if "Welcome back." not in r.content:
            logger.error("Login was not successful")
            raise Exception("Login was not successful")
    except Exception as e:
        import traceback
        traceback.print_exc()
        logger.error("Configuration Error:")
        logger.error(
            "Please ensure that a file called 'scraper_config.json' exists in the scraper "
            "directory, and that it contains valid 'name' and 'pass' parameters for logging "
            "in to the PMG website. This is needed for accessing much of the content.")
        raise e
    self.stats = {}

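# For reference, a minimal sketch of the expected scraper_config.json. The filename
# comes from the error message above, and the 'name' and 'pass' keys match the login
# form fields read from `config`; the values are placeholders, not real credentials:
#
#     {
#         "name": "your-pmg-username",
#         "pass": "your-pmg-password"
#     }
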
def run_scraper(self):
    """
    Iterate through committees on http://www.pmg.org.za/committees, and scrape
    their details.
    """
    for (i, (list_name, href_committee, name)) in enumerate(self.next_committee):
        # determine committee's location (1 = National Assembly, 2 = NCOP, 3 = joint)
        location = None
        if list_name == "National Assembly Committees":
            location = 1
        elif list_name == "NCOP Committees":
            location = 2
        elif list_name == "Joint Committees":
            location = 3
        else:
            if "(NA)" in name:
                location = 1
            elif "(NCOP)" in name:
                location = 2
        # populate entry
        self.current_committee = {
            "type": "committee",
            "url": href_committee,
            "name": name,
            "location": location,
        }
        try:
            self.add_or_update()
        except Exception:
            msg = "Could not add committee to database: "
            if self.current_committee.get("name"):
                msg += self.current_committee["name"]
            self.stats["errors"].append(msg)
            logger.error(msg)
        self.current_committee = {}
    db.session.commit()
    return

def handle_assent():
    """
    Add entries relating to a bill's assent from
    http://pmg.org.za/billsstatus/proceedings, via the csv at
    /data/bill_assent_dates.csv
    """
    with open("../data/bill_assent_dates.csv", 'Ur') as f:
        data = list(list(rec) for rec in csv.reader(f, delimiter=','))
    president = Agent.query.filter(Agent.name == "The President").first()
    for i in range(len(data)):
        # ignore column title row
        if i == 0:
            continue
        entry = data[i]
        # fix bill types
        if entry[0].startswith("PM"):
            entry[0] = "PMB" + entry[0][2::]
        elif not entry[0].startswith("B"):
            entry[0] = "B" + entry[0]
        tmp_code = entry[0]
        # clean bill code
        tmp = analyze_bill_code(tmp_code)
        if tmp:
            code = tmp["code"]
        else:
            logger.error("Error analyzing bill code " + tmp_code)
            continue
        logger.info(code + " " + str(entry))
        bill = Bill.query.filter(Bill.code == code).first()
        if bill is None:
            logger.error("Error finding bill " + code)
            continue
        gazette = None  # guard against carrying a value over from a previous row
        try:
            act_no = unicode(entry[1])
            assent_date = unicode(entry[2])
            # convert date to python date object
            try:
                assent_date = date_parser.parse(assent_date).date()
            except Exception:
                logger.error("Error parsing date " + entry[2])
                continue
            if entry[3] and len(entry[3]) > 2:
                gazette = unicode(entry[3])
        except UnicodeDecodeError:
            logger.error("Unicode error: " + str(entry))
            continue
        # update bill record
        bill.status = "enacted"
        if gazette:
            bill.gazette = gazette
        db.session.add(bill)
        # add relevant entry in bill history
        tmp_entry = Entry.query.join(Entry.bills).filter(Bill.code == code)\
            .filter(Entry.type == "assent").first()
        if not tmp_entry:
            tmp_entry = Entry()
            tmp_entry.bills.append(bill)
        tmp_entry.date = assent_date
        tmp_entry.type = "assent"
        tmp_entry.location = 3
        tmp_entry.title = "Signed into law by the President."
        tmp_entry.agent = president
        if act_no and gazette:
            tmp_entry.description = "Enacted as Act " + act_no + ". Refer to Government Gazette " + gazette + "."
        db.session.add(tmp_entry)
    db.session.commit()
    return

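# For reference, the layout of bill_assent_dates.csv implied by the column indices
# used above: a header row, then one row per bill with bill code, act number, assent
# date, and gazette number. The header names and values below are illustrative only:
#
#     bill_code,act_no,assent_date,gazette
#     B1-2013,22 of 2013,18 July 2013,36685
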
def scrape_committee(self):
    """
    Scrape all meeting reports for a particular committee.
    """
    for (j, (date, title, href_report)) in enumerate(self.next_report):
        # truncate long titles in the debug output
        logger.debug("\t\t" + str(date) + " - " + (title[0:45] if len(title) > 45 else title))
        tmp_url = href_report
        html = scrapertools.URLFetcher(tmp_url, self.session).html
        soup = BeautifulSoup(html)
        content = soup.find(id="content")
        bills = scrapertools.find_bills(str(content))
        # only save report entries that can be tagged to bills
        if bills:
            self.current_report = {
                "entry_type": "committee-meeting",
                "bills": bills,
                "url": tmp_url,
                "date": date,
                "title": title,
                "agent": self.current_committee,
            }
            # report URL may have changed after editing on pmg.org.za, check for this
            possible_duplicates = Entry.query.filter(Entry.agent == self.current_committee)\
                .filter(Entry.url != None)\
                .filter(Entry.url != tmp_url)\
                .filter(Entry.type == "committee-meeting")\
                .filter(Entry.is_deleted == False)\
                .filter(Entry.date == date)\
                .order_by(Entry.entry_id).all()
            deletion_flag = False
            if possible_duplicates:
                logger.debug(str(len(possible_duplicates)) + " possible duplicates found")
                for possible_duplicate in possible_duplicates:
                    redirect_url = scrapertools.URLFetcher(possible_duplicate.url, self.session).follow_redirect()
                    if possible_duplicate.url != redirect_url:
                        logger.debug('redirect encountered')
                        if redirect_url == tmp_url:
                            logger.info("Updating entry URL")
                            # update the existing record's URL
                            possible_duplicate.url = tmp_url
                            # # delete all but one entry, if there are multiple duplicates
                            # if deletion_flag:
                            #     logger.info('duplicate entry deleted')
                            #     possible_duplicate.is_deleted = True
                            db.session.add(possible_duplicate)
                            db.session.commit()
                            deletion_flag = True
            if self.current_committee.location:
                self.current_report["location"] = self.current_committee.location
            try:
                self.add_or_update()
            except Exception as e:
                msg = "Could not add committee report to database: "
                if self.current_report.get("title"):
                    msg += self.current_report["title"]
                self.stats["errors"].append(msg)
                logger.error(msg)
                logger.exception(str(e))
            self.current_report = {}
        else:
            logger.debug('no bills found in committee meeting report')