def run_scraper(self):
    """Scrape hansard listing pages, saving bill-related entries to the DB.

    Walks ``self.next_hansard`` page by page until ``self.next_page`` is
    exhausted, committing once per scraped page. Hansards that mention no
    bills are skipped. Errors from ``add_or_update`` are recorded in
    ``self.stats["errors"]`` and logged, but do not abort the run.
    """
    # Pre-fetch the two house agents referenced by the title prefixes below.
    na = Agent.query.filter(Agent.name == "National Assembly").first()
    ncop = Agent.query.filter(
        Agent.name == "National Council Of Provinces").first()
    while True:
        # NOTE: the enumerate index of the original was unused; dropped.
        for (date, title, href_hansard) in self.next_hansard:
            logger.debug("\t\t" + str(date) + " - " + title)
            tmp_url = href_hansard
            html = scrapertools.URLFetcher(tmp_url, self.session).html
            soup = BeautifulSoup(
                html, convertEntities=BeautifulSoup.HTML_ENTITIES)
            content = soup.find(id="content")
            # find bills that are mentioned in the text
            bills = scrapertools.find_bills(str(content))
            # only save hansards that are related to bills
            if bills:
                # infer location from title, where appropriate
                location = None
                agent = None
                if title.startswith("NA:"):
                    location = 1
                    agent = na
                elif title.startswith("NCOP:"):
                    location = 2
                    agent = ncop
                self.current_hansard = {
                    "bills": bills,
                    "url": tmp_url,
                    "date": date,
                    "title": title,
                    "location": location
                }
                if agent:
                    self.current_hansard['agent'] = agent
                try:
                    self.add_or_update()
                except Exception as e:
                    msg = "Could not add hansard to database: "
                    if self.current_hansard.get("title"):
                        msg += self.current_hansard["title"]
                    self.stats["errors"].append(msg)
                    logger.error(msg)
                    # Log the traceback too, matching scrape_committee's
                    # error handling; previously it was silently discarded.
                    logger.exception(str(e))
                self.current_hansard = {}
        # save hansards to database, once per scraped page
        db.session.commit()
        logger.debug(json.dumps(self.stats, indent=4))
        # test whether we have reached the end
        if not self.next_page:
            break
    return
def run_scraper(self):
    """Walk hansard listing pages and persist entries that reference bills.

    Pages are consumed from ``self.next_hansard``; the loop ends when
    ``self.next_page`` is falsy. One DB commit is issued per page.
    """
    # Agents for the two houses of parliament, looked up once up front.
    national_assembly = Agent.query.filter(
        Agent.name == "National Assembly").first()
    council_of_provinces = Agent.query.filter(
        Agent.name == "National Council Of Provinces").first()
    while True:
        for date, title, href_hansard in self.next_hansard:
            logger.debug("\t\t" + str(date) + " - " + title)
            page_url = href_hansard
            page_html = scrapertools.URLFetcher(page_url, self.session).html
            page_soup = BeautifulSoup(
                page_html, convertEntities=BeautifulSoup.HTML_ENTITIES)
            page_content = page_soup.find(id="content")
            # Bills referenced anywhere in the page body.
            mentioned_bills = scrapertools.find_bills(str(page_content))
            if not mentioned_bills:
                # Only hansards related to bills are saved.
                continue
            # Derive house/agent from the title prefix, when present.
            if title.startswith("NA:"):
                location, agent = 1, national_assembly
            elif title.startswith("NCOP:"):
                location, agent = 2, council_of_provinces
            else:
                location, agent = None, None
            self.current_hansard = {
                "bills": mentioned_bills,
                "url": page_url,
                "date": date,
                "title": title,
                "location": location
            }
            if agent:
                self.current_hansard['agent'] = agent
            try:
                self.add_or_update()
            except Exception:
                msg = "Could not add hansard to database: "
                if self.current_hansard.get("title"):
                    msg += self.current_hansard["title"]
                self.stats["errors"].append(msg)
                logger.error(msg)
            self.current_hansard = {}
        # Persist this page's hansards in a single commit.
        db.session.commit()
        logger.debug(json.dumps(self.stats, indent=4))
        # Stop once there are no more listing pages.
        if not self.next_page:
            break
    return
def scrape_committee(self):
    """
    Scrape all meeting reports for a particular committee.

    For each report in ``self.next_report`` that mentions at least one
    bill, build ``self.current_report`` and hand it to ``add_or_update``.
    Reports mentioning no bills are skipped. Existing entries whose URL
    has changed on pmg.org.za (detected via redirect) have their URL
    updated in place.
    """
    for (j, (date, title, href_report)) in enumerate(self.next_report):
        # Truncate long titles to 45 chars for the debug log.
        # Fix: the conditional expression must be parenthesized — the old
        # code's ternary bound over the whole concatenation, so short
        # titles were logged without the "\t\t<date> - " prefix.
        logger.debug("\t\t" + str(date) + " - " +
                     (title[0:45] if len(title) > 45 else title))
        tmp_url = href_report
        html = scrapertools.URLFetcher(tmp_url, self.session).html
        soup = BeautifulSoup(html)
        content = soup.find(id="content")
        bills = scrapertools.find_bills(str(content))
        # only save report entries that can be tagged to bills
        if bills:
            self.current_report = {
                "entry_type": "committee-meeting",
                "bills": bills,
                "url": tmp_url,
                "date": date,
                "title": title,
                "agent": self.current_committee,
            }
            # report URL may have changed after editing on pmg.org.za,
            # check for this
            possible_duplicates = Entry.query.filter(Entry.agent == self.current_committee)\
                .filter(Entry.url != None)\
                .filter(Entry.url != tmp_url)\
                .filter(Entry.type == "committee-meeting")\
                .filter(Entry.is_deleted == False)\
                .filter(Entry.date == date)\
                .order_by(Entry.entry_id).all()
            deletion_flag = False
            if possible_duplicates:
                logger.debug(
                    str(len(possible_duplicates)) + " possible duplicates found")
                for possible_duplicate in possible_duplicates:
                    redirect_url = scrapertools.URLFetcher(
                        possible_duplicate.url, self.session).follow_redirect()
                    if possible_duplicate.url != redirect_url:
                        logger.debug('redirect encountered')
                        if redirect_url == tmp_url:
                            logger.info("Updating entry URL")
                            # update the existing record's URL
                            possible_duplicate.url = tmp_url
                            # # delete all but one entry, if there are multiple duplicates
                            # if deletion_flag:
                            #     logger.info('duplicate entry deleted')
                            #     possible_duplicate.is_deleted = True
                            db.session.add(possible_duplicate)
                            db.session.commit()
                            # NOTE(review): deletion_flag is consumed only
                            # by the commented-out block above.
                            deletion_flag = True
            if self.current_committee.location:
                self.current_report[
                    "location"] = self.current_committee.location
            try:
                self.add_or_update()
            # "except Exception, e" is Python-2-only; "as e" works on 2.6+.
            except Exception as e:
                msg = "Could not add committee report to database: "
                if self.current_report.get("title"):
                    msg += self.current_report["title"]
                self.stats["errors"].append(msg)
                logger.error(msg)
                logger.exception(str(e))
            self.current_report = {}
        else:
            logger.debug('no bills found in committee meeting report')
def scrape_committee(self):
    """
    Scrape all meeting reports for a particular committee.

    Saves only reports that can be tagged to at least one bill; detects
    entries whose pmg.org.za URL changed (via redirect) and updates them.
    """
    for (j, (date, title, href_report)) in enumerate(self.next_report):
        # Fix: parenthesize the conditional — previously the ternary bound
        # over the entire concatenation, so titles of 45 chars or fewer
        # were logged without the "\t\t<date> - " prefix.
        logger.debug("\t\t" + str(date) + " - " +
                     (title[0:45] if len(title) > 45 else title))
        tmp_url = href_report
        html = scrapertools.URLFetcher(tmp_url, self.session).html
        soup = BeautifulSoup(html)
        content = soup.find(id="content")
        bills = scrapertools.find_bills(str(content))
        # only save report entries that can be tagged to bills
        if bills:
            self.current_report = {
                "entry_type": "committee-meeting",
                "bills": bills,
                "url": tmp_url,
                "date": date,
                "title": title,
                "agent": self.current_committee,
            }
            # report URL may have changed after editing on pmg.org.za,
            # check for this
            possible_duplicates = Entry.query.filter(Entry.agent == self.current_committee)\
                .filter(Entry.url != None)\
                .filter(Entry.url != tmp_url)\
                .filter(Entry.type == "committee-meeting")\
                .filter(Entry.is_deleted == False)\
                .filter(Entry.date == date)\
                .order_by(Entry.entry_id).all()
            deletion_flag = False
            if possible_duplicates:
                logger.debug(
                    str(len(possible_duplicates)) + " possible duplicates found")
                for possible_duplicate in possible_duplicates:
                    redirect_url = scrapertools.URLFetcher(
                        possible_duplicate.url, self.session).follow_redirect()
                    if possible_duplicate.url != redirect_url:
                        logger.debug('redirect encountered')
                        if redirect_url == tmp_url:
                            logger.info("Updating entry URL")
                            # update the existing record's URL
                            possible_duplicate.url = tmp_url
                            # # delete all but one entry, if there are multiple duplicates
                            # if deletion_flag:
                            #     logger.info('duplicate entry deleted')
                            #     possible_duplicate.is_deleted = True
                            db.session.add(possible_duplicate)
                            db.session.commit()
                            deletion_flag = True
            if self.current_committee.location:
                self.current_report[
                    "location"] = self.current_committee.location
            try:
                self.add_or_update()
            # Python-3-compatible spelling of "except Exception, e".
            except Exception as e:
                msg = "Could not add committee report to database: "
                if self.current_report.get("title"):
                    msg += self.current_report["title"]
                self.stats["errors"].append(msg)
                logger.error(msg)
                logger.exception(str(e))
            self.current_report = {}
        else:
            logger.debug('no bills found in committee meeting report')