def run_scraper(self):
    """Scrape meeting reports for every committee agent that has a URL.

    Iterates all committee Agents in random order, fetches each committee
    page, delegates to scrape_committee(), and commits to the database once
    per committee. Errors are recorded in self.stats and logged; scraping
    continues with the next committee.
    """
    targets = Agent.query.filter(Agent.type == "committee").filter(Agent.url != None).all()
    shuffle(targets)  # randomize the order, just to keep things interesting
    total = len(targets)
    for index, target in enumerate(targets):
        self.current_committee = target
        self.current_url = target.url
        try:
            self.current_page = scrapertools.URLFetcher(self.current_url, self.session).html
            logger.debug("Committee: " + str(target.name))
            self.scrape_committee()
            # give some progress feedback
            logger.info(str(index + 1) + " out of " + str(total) + " committees' reports have been scraped.")
            logger.info(json.dumps(self.stats, indent=4))
            # commit entries to database, once per committee
            logger.debug("SAVING TO DATABASE")
            db.session.commit()
        except Exception as e:
            msg = "Error scraping committee's reports."
            self.stats["errors"].append(msg)
            logger.error(msg)
            logger.exception(str(e))
    return
def run_scraper(self):
    """Scrape meeting reports for every committee agent that has a URL.

    Fetches each committee page in shuffled order, hands off to
    scrape_committee(), logs progress and stats, and commits once per
    committee. A failure on one committee is logged and the loop moves on.
    """
    committees = Agent.query.filter(Agent.type == "committee").filter(Agent.url != None).all()
    # randomize the order, just to keep things interesting
    shuffle(committees)
    for i, committee in enumerate(committees):
        self.current_committee = committee
        self.current_url = committee.url
        try:
            fetcher = scrapertools.URLFetcher(self.current_url, self.session)
            self.current_page = fetcher.html
            logger.debug("Committee: " + str(committee.name))
            self.scrape_committee()
            # give some progress feedback
            progress = str(i + 1) + " out of " + str(len(committees)) + " committees' reports have been scraped."
            logger.info(progress)
            logger.info(json.dumps(self.stats, indent=4))
            # commit entries to database, once per committee
            logger.debug("SAVING TO DATABASE")
            db.session.commit()
        except Exception as e:
            msg = "Error scraping committee's reports."
            self.stats["errors"].append(msg)
            logger.error(msg)
            logger.exception(str(e))
    return
def add_or_update(self):
    """
    Add current_bill to database, or update the record if it already exists.
    Then clear the current_bill attribute to make it ready for the next bill
    to be scraped.
    """
    bill_data = self.current_bill
    try:
        # `.get(...) == "Draft"` is equivalent to the original
        # `.get('status') and ... == "Draft"` double lookup.
        if bill_data.get('status') == "Draft":
            # save scraped draft bill to database
            bill = Bill.query.filter(Bill.name == bill_data['bill_name']).filter(Bill.year == bill_data['year']).first()
            if bill is None:
                bill = Bill()
                bill.name = bill_data['bill_name']
                bill.year = bill_data['year']
                self.stats['new_drafts'] += 1
            bill.bill_type = "Draft"
            db.session.add(bill)
            self.stats['total_drafts'] += 1
        else:
            # save scraped bills to database
            bill_code = bill_data["code"]
            bill = Bill.query.filter(Bill.code == bill_code).first()
            if bill is None:
                bill = Bill()
                bill.code = bill_code
                self.stats['new_bills'] += 1
            bill.name = bill_data['bill_name']
            bill.year = bill_data['year']
            bill.number = bill_data['number']
            db.session.add(bill)
            self.stats['total_bills'] += 1
            # save related bill versions
            for entry_data in bill_data['versions']:
                # Look for pre-existing entry.
                entry = Entry.query.filter(Entry.url == entry_data['url']).first()
                if entry is None:
                    entry = Entry()  # Create new entry.
                    self.stats['new_bill_versions'] += 1
                entry = scrapertools.populate_entry(entry, entry_data)
                entry.bills.append(bill)
                db.session.add(entry)
                self.stats['total_bill_versions'] += 1
    except Exception as e:
        error_msg = "Error saving bill: "
        if self.current_bill.get('bill_name'):
            error_msg += self.current_bill['bill_name']
        if self.current_bill.get('versions'):
            error_msg += " - " + self.current_bill['versions'][0]['title']
        logger.error(error_msg)
        # Previously the exception itself was silently discarded; keep the
        # traceback so the root cause is diagnosable from the logs.
        logger.exception(str(e))
        self.stats['errors'].append(error_msg)
    logger.debug(json.dumps(self.current_bill, indent=4, default=scrapertools.handler))
    self.current_bill = {}
    return
def run_scraper(self):
    """Scrape hansard pages, saving only those that mention bills.

    Walks self.next_hansard page by page until self.next_page is exhausted,
    tagging each saved hansard with its inferred house (NA / NCOP) where the
    title makes that explicit, and committing once per scraped page.
    """
    # define agents
    na = Agent.query.filter(Agent.name == "National Assembly").first()
    ncop = Agent.query.filter(Agent.name == "National Council Of Provinces").first()
    while True:
        # (the original enumerate index was never used)
        for (date, title, href_hansard) in self.next_hansard:
            logger.debug("\t\t" + str(date) + " - " + title)
            tmp_url = href_hansard
            html = scrapertools.URLFetcher(tmp_url, self.session).html
            soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
            content = soup.find(id="content")
            # find bills that are mentioned in the text
            bills = scrapertools.find_bills(str(content))
            # only save hansards that are related to bills
            if bills:
                # infer location from title, where appropriate
                location = None
                agent = None
                if title.startswith("NA:"):
                    location = 1
                    agent = na
                elif title.startswith("NCOP:"):
                    location = 2
                    agent = ncop
                self.current_hansard = {
                    "bills": bills,
                    "url": tmp_url,
                    "date": date,
                    "title": title,
                    "location": location,
                }
                if agent:
                    self.current_hansard['agent'] = agent
                try:
                    self.add_or_update()
                except Exception as e:
                    msg = "Could not add hansard to database: "
                    if self.current_hansard.get("title"):
                        msg += self.current_hansard["title"]
                    self.stats["errors"].append(msg)
                    logger.error(msg)
                    # keep the traceback (was silently dropped before),
                    # consistent with the committee scraper's handler
                    logger.exception(str(e))
                self.current_hansard = {}
        # save hansards to database, once per scraped page
        db.session.commit()
        logger.debug(json.dumps(self.stats, indent=4))
        # test whether we have reached the end
        if not self.next_page:
            break
    return
def run_scraper(self):
    """Scrape hansard pages, saving only those that mention bills.

    Iterates self.next_hansard until self.next_page runs out; hansards whose
    titles start with "NA:" / "NCOP:" are tagged with the matching agent and
    location. Commits once per scraped page.
    """
    # define agents
    na = Agent.query.filter(Agent.name == "National Assembly").first()
    ncop = Agent.query.filter(Agent.name == "National Council Of Provinces").first()
    while True:
        # (the enumerate index in the original was unused)
        for (date, title, href_hansard) in self.next_hansard:
            logger.debug("\t\t" + str(date) + " - " + title)
            tmp_url = href_hansard
            html = scrapertools.URLFetcher(tmp_url, self.session).html
            soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
            content = soup.find(id="content")
            # find bills that are mentioned in the text
            bills = scrapertools.find_bills(str(content))
            # only save hansards that are related to bills
            if bills:
                # infer location from title, where appropriate
                location = None
                agent = None
                if title.startswith("NA:"):
                    location = 1
                    agent = na
                elif title.startswith("NCOP:"):
                    location = 2
                    agent = ncop
                self.current_hansard = {
                    "bills": bills,
                    "url": tmp_url,
                    "date": date,
                    "title": title,
                    "location": location,
                }
                if agent:
                    self.current_hansard['agent'] = agent
                try:
                    self.add_or_update()
                except Exception as e:
                    msg = "Could not add hansard to database: "
                    if self.current_hansard.get("title"):
                        msg += self.current_hansard["title"]
                    self.stats["errors"].append(msg)
                    logger.error(msg)
                    # log the traceback instead of silently discarding it
                    logger.exception(str(e))
                self.current_hansard = {}
        # save hansards to database, once per scraped page
        db.session.commit()
        logger.debug(json.dumps(self.stats, indent=4))
        # test whether we have reached the end
        if not self.next_page:
            break
    return
def next_committee(self):
    """Yield (list_name, href, name) for every committee on the PMG site.

    Fetches http://www.pmg.org.za/committees and walks each "item-list"
    group (one per committee category), yielding the category heading, the
    absolute committee URL and the committee name.
    """
    html = scrapertools.URLFetcher("http://www.pmg.org.za/committees", self.session).html
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    container = soup.find(id="committees-all")
    committee_lists = container.findAll("div", {"class": "item-list"})
    for committee_list in committee_lists:
        list_name = committee_list.find('h3').contents[0]
        logger.debug("\n" + list_name + ":")
        committees = committee_list.findAll('li')
        for committee in committees:
            link = committee.find('a')
            # Look the href up by name: attrs[0][1] assumed 'href' was the
            # first attribute of the tag, which breaks when it is not.
            href = "http://www.pmg.org.za" + link['href']
            name = link.contents[0]
            logger.debug("\t" + name)
            yield list_name, href, name
def version_state(self, fragment):
    """
    Extract available versions from second row.

    Returns True (and stays in this state) when the row holds a bill-version
    link; otherwise switches back to the header state and returns False.
    """
    link = fragment.find("a")
    # test whether the row contains a link to a bill version
    if link and not ("bills.pmg.org.za" in link["href"] or "Bill Tracker" in link.text):
        versions = self.current_bill.setdefault("versions", [])
        url = link["href"]
        if not self.current_bill.get("code"):
            tmp = link.text
            info = scrapertools.analyze_bill_code(tmp)
            if info:
                # py2-style dict merge: info overrides current_bill keys
                self.current_bill = dict(self.current_bill.items() + info.items())
            else:
                logger.error("No bill found in string: " + tmp)
        try:
            version = {
                # reuse the local instead of re-reading link["href"]
                # (the original assigned `url` but never used it)
                "url": url,
                "title": link.text,
                "date": date_parser.parse(fragment.findAll("td")[1].text).date(),
                "entry_type": "bill-version",
            }
        except Exception:
            # dump the offending row before re-raising, for diagnosis
            logger.debug(str(fragment))
            raise
        # set entry_type appropriately if this bill has already been enacted
        if "as enacted" in link.text:
            version['entry_type'] = "act"
        versions.append(version)
        self.state_fn = self.version_state
        return True
    else:
        self.state_fn = self.header_state
        return False
def scrape_committee(self):
    """
    Scrape all meeting reports for a particular committee.

    Saves only reports that mention bills, de-duplicating against existing
    entries whose URLs have since been edited on pmg.org.za.
    """
    # (the original enumerate index was never used)
    for (date, title, href_report) in self.next_report:
        # BUG FIX: the ternary must be parenthesized so it only truncates the
        # title; previously `A + B if cond else title` logged the bare title
        # (no date prefix) whenever the title was 45 chars or shorter.
        logger.debug("\t\t" + str(date) + " - " + (title[0:45] if len(title) > 45 else title))
        tmp_url = href_report
        html = scrapertools.URLFetcher(tmp_url, self.session).html
        soup = BeautifulSoup(html)
        content = soup.find(id="content")
        bills = scrapertools.find_bills(str(content))
        # only save report entries that can be tagged to bills
        if bills:
            self.current_report = {
                "entry_type": "committee-meeting",
                "bills": bills,
                "url": tmp_url,
                "date": date,
                "title": title,
                "agent": self.current_committee,
            }
            # report URL may have changed after editing on pmg.org.za, check for this
            possible_duplicates = Entry.query.filter(Entry.agent == self.current_committee)\
                .filter(Entry.url != None)\
                .filter(Entry.url != tmp_url)\
                .filter(Entry.type == "committee-meeting")\
                .filter(Entry.is_deleted == False)\
                .filter(Entry.date == date)\
                .order_by(Entry.entry_id).all()
            deletion_flag = False
            if possible_duplicates:
                logger.debug(str(len(possible_duplicates)) + " possible duplicates found")
                for possible_duplicate in possible_duplicates:
                    redirect_url = scrapertools.URLFetcher(possible_duplicate.url, self.session).follow_redirect()
                    if possible_duplicate.url != redirect_url:
                        logger.debug('redirect encountered')
                        if redirect_url == tmp_url:
                            logger.info("Updating entry URL")
                            # update the existing record's URL
                            possible_duplicate.url = tmp_url
                            # # delete all but one entry, if there are multiple duplicates
                            # if deletion_flag:
                            #     logger.info('duplicate entry deleted')
                            #     possible_duplicate.is_deleted = True
                            db.session.add(possible_duplicate)
                            db.session.commit()
                            deletion_flag = True
            if self.current_committee.location:
                self.current_report["location"] = self.current_committee.location
            try:
                self.add_or_update()
            except Exception as e:
                msg = "Could not add committee report to database: "
                if self.current_report.get("title"):
                    msg += self.current_report["title"]
                self.stats["errors"].append(msg)
                logger.error(msg)
                logger.exception(str(e))
            self.current_report = {}
        else:
            logger.debug('no bills found in committee meeting report')
def add_or_update(self):
    """
    Add current_bill to database, or update the record if it already exists.
    Then clear the current_bill attribute to make it ready for the next bill
    to be scraped.
    """
    bill_data = self.current_bill
    try:
        # equivalent to the original `.get('status') and ... == "Draft"`
        if bill_data.get('status') == "Draft":
            # save scraped draft bill to database
            bill = Bill.query.filter(
                Bill.name == bill_data['bill_name']).filter(
                    Bill.year == bill_data['year']).first()
            if bill is None:
                bill = Bill()
                bill.name = bill_data['bill_name']
                bill.year = bill_data['year']
                self.stats['new_drafts'] += 1
            bill.bill_type = "Draft"
            db.session.add(bill)
            self.stats['total_drafts'] += 1
        else:
            # save scraped bills to database
            bill_code = bill_data["code"]
            bill = Bill.query.filter(Bill.code == bill_code).first()
            if bill is None:
                bill = Bill()
                bill.code = bill_code
                self.stats['new_bills'] += 1
            bill.name = bill_data['bill_name']
            bill.year = bill_data['year']
            bill.number = bill_data['number']
            db.session.add(bill)
            self.stats['total_bills'] += 1
            # save related bill versions
            for entry_data in bill_data['versions']:
                # Look for pre-existing entry.
                entry = Entry.query.filter(
                    Entry.url == entry_data['url']).first()
                if entry is None:
                    entry = Entry()  # Create new entry.
                    self.stats['new_bill_versions'] += 1
                entry = scrapertools.populate_entry(entry, entry_data)
                entry.bills.append(bill)
                db.session.add(entry)
                self.stats['total_bill_versions'] += 1
    except Exception as e:
        error_msg = "Error saving bill: "
        if self.current_bill.get('bill_name'):
            error_msg += self.current_bill['bill_name']
        if self.current_bill.get('versions'):
            error_msg += " - " + self.current_bill['versions'][0]['title']
        logger.error(error_msg)
        # previously the traceback was silently discarded
        logger.exception(str(e))
        self.stats['errors'].append(error_msg)
    logger.debug(
        json.dumps(self.current_bill, indent=4, default=scrapertools.handler))
    self.current_bill = {}
    return
def scrape_committee(self):
    """
    Scrape all meeting reports for a particular committee.

    Only reports that mention bills are saved; existing entries whose URLs
    were edited on pmg.org.za are detected via redirects and updated.
    """
    # (the original enumerate index `j` was unused)
    for (date, title, href_report) in self.next_report:
        # BUG FIX: parenthesize the conditional so it truncates only the
        # title; without parentheses the ternary applied to the whole
        # concatenation, dropping the "\t\t<date> - " prefix for short titles.
        logger.debug("\t\t" + str(date) + " - " + (title[0:45] if len(title) > 45 else title))
        tmp_url = href_report
        html = scrapertools.URLFetcher(tmp_url, self.session).html
        soup = BeautifulSoup(html)
        content = soup.find(id="content")
        bills = scrapertools.find_bills(str(content))
        # only save report entries that can be tagged to bills
        if bills:
            self.current_report = {
                "entry_type": "committee-meeting",
                "bills": bills,
                "url": tmp_url,
                "date": date,
                "title": title,
                "agent": self.current_committee,
            }
            # report URL may have changed after editing on pmg.org.za, check for this
            possible_duplicates = Entry.query.filter(Entry.agent == self.current_committee)\
                .filter(Entry.url != None)\
                .filter(Entry.url != tmp_url)\
                .filter(Entry.type == "committee-meeting")\
                .filter(Entry.is_deleted == False)\
                .filter(Entry.date == date)\
                .order_by(Entry.entry_id).all()
            deletion_flag = False
            if possible_duplicates:
                logger.debug(str(len(possible_duplicates)) + " possible duplicates found")
                for possible_duplicate in possible_duplicates:
                    redirect_url = scrapertools.URLFetcher(possible_duplicate.url, self.session).follow_redirect()
                    if possible_duplicate.url != redirect_url:
                        logger.debug('redirect encountered')
                        if redirect_url == tmp_url:
                            logger.info("Updating entry URL")
                            # update the existing record's URL
                            possible_duplicate.url = tmp_url
                            # # delete all but one entry, if there are multiple duplicates
                            # if deletion_flag:
                            #     logger.info('duplicate entry deleted')
                            #     possible_duplicate.is_deleted = True
                            db.session.add(possible_duplicate)
                            db.session.commit()
                            deletion_flag = True
            if self.current_committee.location:
                self.current_report["location"] = self.current_committee.location
            try:
                self.add_or_update()
            except Exception as e:
                msg = "Could not add committee report to database: "
                if self.current_report.get("title"):
                    msg += self.current_report["title"]
                self.stats["errors"].append(msg)
                logger.error(msg)
                logger.exception(str(e))
            self.current_report = {}
        else:
            logger.debug('no bills found in committee meeting report')