def __init__(self, session):
    self.session = session
    self.current_url = "http://www.pmg.org.za/hansard"
    self.current_page = scrapertools.URLFetcher(self.current_url, self.session).html
    self.current_hansard = {}
    self.stats = {"total_hansards": 0, "new_hansards": 0, "errors": []}
def run_scraper(self):
    committees = Agent.query.filter(Agent.type == "committee").filter(
        Agent.url != None).all()
    shuffle(committees)  # randomize the order, just to keep things interesting
    for i, committee in enumerate(committees):
        self.current_committee = committee
        self.current_url = committee.url
        try:
            self.current_page = scrapertools.URLFetcher(
                self.current_url, self.session).html
            logger.debug("Committee: " + str(committee.name))
            self.scrape_committee()
            # give some progress feedback
            logger.info(
                str(i + 1) + " out of " + str(len(committees)) +
                " committees' reports have been scraped.")
            logger.info(json.dumps(self.stats, indent=4))
            # commit entries to database, once per committee
            logger.debug("SAVING TO DATABASE")
            db.session.commit()
        except Exception as e:
            msg = "Error scraping committee's reports."
            self.stats["errors"].append(msg)
            logger.error(msg)
            logger.exception(str(e))
    return
def run_scraper(self): """ Iterate through bill pages, and run the state machine for each page. """ pager = Pager() # iterate through bill pages for url in pager.next_page: logger.info(url) # initiate parser for this page self.state_fn = self.start_state html = scrapertools.URLFetcher(url, self.session).html soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES) table = soup.find("tbody") rows = table.findAll("tr") # feed rows into state machine for row in rows: while not self.state_fn(row): pass # commit to database after each page # db.session.commit() return
def run_scraper(self):
    # define agents
    na = Agent.query.filter(Agent.name == "National Assembly").first()
    ncop = Agent.query.filter(
        Agent.name == "National Council Of Provinces").first()
    while True:
        for (j, (date, title, href_hansard)) in enumerate(self.next_hansard):
            logger.debug("\t\t" + str(date) + " - " + title)
            tmp_url = href_hansard
            html = scrapertools.URLFetcher(tmp_url, self.session).html
            soup = BeautifulSoup(
                html, convertEntities=BeautifulSoup.HTML_ENTITIES)
            content = soup.find(id="content")
            # find bills that are mentioned in the text
            bills = scrapertools.find_bills(str(content))
            # only save hansards that are related to bills
            if bills:
                # infer location from title, where appropriate
                location = None
                agent = None
                if title.startswith("NA:"):
                    location = 1
                    agent = na
                elif title.startswith("NCOP:"):
                    location = 2
                    agent = ncop
                self.current_hansard = {
                    "bills": bills,
                    "url": tmp_url,
                    "date": date,
                    "title": title,
                    "location": location
                }
                if agent:
                    self.current_hansard['agent'] = agent
                try:
                    self.add_or_update()
                except Exception:
                    msg = "Could not add hansard to database: "
                    if self.current_hansard.get("title"):
                        msg += self.current_hansard["title"]
                    self.stats["errors"].append(msg)
                    logger.error(msg)
                self.current_hansard = {}
        # save hansards to database, once per scraped page
        db.session.commit()
        logger.debug(json.dumps(self.stats, indent=4))
        # test whether we have reached the end
        if not self.next_page:
            break
    return
def next_page(self):
    soup = BeautifulSoup(self.current_page)
    next_link = soup.find("li", {"class": "pager-next"})
    if next_link:
        # BeautifulSoup 3 stores attrs as (name, value) tuples; this relies on the
        # <a> tag's first attribute being its href
        href = "http://www.pmg.org.za" + next_link.find('a').attrs[0][1]
        self.current_url = href
        self.current_page = scrapertools.URLFetcher(
            self.current_url, self.session).html
        return True
    return False
def next_page(self): """ Extract the 'next' link, if there is one. """ soup = BeautifulSoup(self.current_page) reports_tab = soup.find(id="quicktabs_tabpage_committees_tabs_1") next_link = reports_tab.find("li", {"class": "pager-next"}) if next_link: href = "http://www.pmg.org.za" + next_link.find('a').attrs[0][1] self.current_url = href self.current_page = scrapertools.URLFetcher( self.current_url, self.session).html return True return False
def next_committee(self):
    html = scrapertools.URLFetcher("http://www.pmg.org.za/committees",
                                   self.session).html
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    container = soup.find(id="committees-all")
    committee_lists = container.findAll("div", {"class": "item-list"})
    for committee_list in committee_lists:
        list_name = committee_list.find('h3').contents[0]
        logger.debug("\n" + list_name + ":")
        committees = committee_list.findAll('li')
        for committee in committees:
            href = "http://www.pmg.org.za" + committee.find('a').attrs[0][1]
            name = committee.find('a').contents[0]
            logger.debug("\t" + name)
            yield list_name, href, name
def scrape_committee(self): """ Scrape all meeting reports for a particular committee. """ for (j, (date, title, href_report)) in enumerate(self.next_report): logger.debug("\t\t" + str(date) + " - " + (title[0:45]) if len(title) > 45 else title) tmp_url = href_report html = scrapertools.URLFetcher(tmp_url, self.session).html soup = BeautifulSoup(html) content = soup.find(id="content") bills = scrapertools.find_bills(str(content)) # only save report entries that can be tagged to bills if bills: self.current_report = { "entry_type": "committee-meeting", "bills": bills, "url": tmp_url, "date": date, "title": title, "agent": self.current_committee, } # report URL may have changed after editing on pmg.org.za, check for this possible_duplicates = Entry.query.filter(Entry.agent == self.current_committee)\ .filter(Entry.url != None)\ .filter(Entry.url != tmp_url)\ .filter(Entry.type == "committee-meeting")\ .filter(Entry.is_deleted == False)\ .filter(Entry.date == date)\ .order_by(Entry.entry_id).all() deletion_flag = False if possible_duplicates: logger.debug( str(len(possible_duplicates)) + " possible duplicates found") for possible_duplicate in possible_duplicates: redirect_url = scrapertools.URLFetcher( possible_duplicate.url, self.session).follow_redirect() if possible_duplicate.url != redirect_url: logger.debug('redirect encountered') if redirect_url == tmp_url: logger.info("Updating entry URL") # update the existing record's URL possible_duplicate.url = tmp_url # # delete all but one entry, if there are multiple duplicates # if deletion_flag: # logger.info('duplicate entry deleted') # possible_duplicate.is_deleted = True db.session.add(possible_duplicate) db.session.commit() deletion_flag = True if self.current_committee.location: self.current_report[ "location"] = self.current_committee.location try: self.add_or_update() except Exception, e: msg = "Could not add committee report to database: " if self.current_report.get("title"): msg += self.current_report["title"] self.stats["errors"].append(msg) logger.error(msg) logger.exception(str(e)) self.current_report = {} else: logger.debug('no bills found in committee meeting report')