def run_scraper(self):
    committees = Agent.query.filter(Agent.type == "committee").filter(Agent.url != None).all()
    shuffle(committees)  # randomize the order, just to keep things interesting

    for i, committee in enumerate(committees):
        self.current_committee = committee
        self.current_url = committee.url
        try:
            self.current_page = scrapertools.URLFetcher(self.current_url, self.session).html
            logger.debug("Committee: " + str(committee.name))
            self.scrape_committee()

            # give some progress feedback
            logger.info(str(i + 1) + " out of " + str(len(committees)) + " committees' reports have been scraped.")
            logger.info(json.dumps(self.stats, indent=4))

            # commit entries to database, once per committee
            logger.debug("SAVING TO DATABASE")
            db.session.commit()
        except Exception as e:
            msg = "Error scraping committee's reports."
            self.stats["errors"].append(msg)
            logger.error(msg)
            logger.exception(str(e))
    return
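# NOTE: the snippet below is a minimal sketch of the interface that run_scraper()
# and scrape_committee() assume from scrapertools.URLFetcher (a .html property for
# the fetched page and a follow_redirect() method returning the final URL). It is
# an illustrative assumption, not the real scrapertools implementation.
import requests


class URLFetcher(object):

    def __init__(self, url, session=None):
        self.url = url
        self.session = session or requests.Session()

    @property
    def html(self):
        # fetch the page and return its body as text
        response = self.session.get(self.url, timeout=30)
        response.raise_for_status()
        return response.text

    def follow_redirect(self):
        # follow any redirects and report the URL we end up at
        response = self.session.head(self.url, allow_redirects=True, timeout=30)
        return response.url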
def scrape_committee(self):
    """
    Scrape all meeting reports for a particular committee.
    """

    for (j, (date, title, href_report)) in enumerate(self.next_report):
        # note the parentheses around the conditional expression, so that the
        # truncation applies to the title rather than to the whole log message
        logger.debug("\t\t" + str(date) + " - " + (title[0:45] if len(title) > 45 else title))
        tmp_url = href_report
        html = scrapertools.URLFetcher(tmp_url, self.session).html
        soup = BeautifulSoup(html)
        content = soup.find(id="content")
        bills = scrapertools.find_bills(str(content))

        # only save report entries that can be tagged to bills
        if bills:
            self.current_report = {
                "entry_type": "committee-meeting",
                "bills": bills,
                "url": tmp_url,
                "date": date,
                "title": title,
                "agent": self.current_committee,
            }

            # report URL may have changed after editing on pmg.org.za, check for this
            possible_duplicates = Entry.query.filter(Entry.agent == self.current_committee)\
                .filter(Entry.url != None)\
                .filter(Entry.url != tmp_url)\
                .filter(Entry.type == "committee-meeting")\
                .filter(Entry.is_deleted == False)\
                .filter(Entry.date == date)\
                .order_by(Entry.entry_id).all()

            deletion_flag = False
            if possible_duplicates:
                logger.debug(str(len(possible_duplicates)) + " possible duplicates found")
                for possible_duplicate in possible_duplicates:
                    redirect_url = scrapertools.URLFetcher(possible_duplicate.url, self.session).follow_redirect()
                    if possible_duplicate.url != redirect_url:
                        logger.debug('redirect encountered')
                        if redirect_url == tmp_url:
                            logger.info("Updating entry URL")
                            # update the existing record's URL
                            possible_duplicate.url = tmp_url
                            # # delete all but one entry, if there are multiple duplicates
                            # if deletion_flag:
                            #     logger.info('duplicate entry deleted')
                            #     possible_duplicate.is_deleted = True
                            db.session.add(possible_duplicate)
                            db.session.commit()
                            deletion_flag = True

            if self.current_committee.location:
                self.current_report["location"] = self.current_committee.location

            try:
                self.add_or_update()
            except Exception as e:
                msg = "Could not add committee report to database: "
                if self.current_report.get("title"):
                    msg += self.current_report["title"]
                self.stats["errors"].append(msg)
                logger.error(msg)
                logger.exception(str(e))
            self.current_report = {}
        else:
            logger.debug('no bills found in committee meeting report')
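# NOTE: scrape_committee() iterates over self.next_report, which is assumed to
# yield one (date, title, href) tuple per meeting report linked from the current
# committee's page. The sketch below shows one way such a generator could look;
# the CSS classes and date format are illustrative assumptions, not the actual
# pmg.org.za markup.
from datetime import datetime

from bs4 import BeautifulSoup


def iter_reports(page_html, base_url):
    soup = BeautifulSoup(page_html)
    for row in soup.find_all("li", class_="meeting-report"):  # assumed markup
        link = row.find("a")
        date_text = row.find("span", class_="date").text.strip()
        date = datetime.strptime(date_text, "%d %b %Y").date()  # assumed date format
        yield (date, link.text.strip(), base_url + link["href"])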