    def run_scraper(self):

        committees = Agent.query.filter(Agent.type == "committee").filter(Agent.url != None).all()
        shuffle(committees)  # randomize the order, just to keep things interesting
        for i, committee in enumerate(committees):
            self.current_committee = committee
            self.current_url = committee.url
            try:
                self.current_page = scrapertools.URLFetcher(self.current_url, self.session).html
                logger.debug("Committee: " + str(committee.name))

                self.scrape_committee()
                # give some progress feedback
                logger.info(str(i + 1) + " out of " + str(len(committees)) + " committees' reports have been scraped.")
                logger.info(json.dumps(self.stats, indent=4))

                # commit entries to database, once per committee
                logger.debug("SAVING TO DATABASE")
                db.session.commit()
            except Exception as e:
                msg = "Error scraping committee's reports."
                self.stats["errors"].append(msg)
                logger.error(msg)
                logger.exception(str(e))
        return
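The class that hosts these methods is not shown in the snippets, so the driver below is only an illustrative sketch: the class name ReportScraper, its constructor, and the stats layout are assumptions.

import logging
import requests

logger = logging.getLogger(__name__)

def main():
    # Hypothetical driver; ReportScraper and its constructor are assumptions,
    # since these snippets omit the class definition that hosts run_scraper().
    session = requests.Session()  # shared HTTP session, reused by URLFetcher
    scraper = ReportScraper(session=session)
    scraper.stats = {"errors": []}  # run_scraper() appends error messages here
    scraper.run_scraper()
    if scraper.stats["errors"]:
        logger.warning("%s committees failed", len(scraper.stats["errors"]))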
    def add_or_update(self):
        """
        Add current_bill to database, or update the record if it already exists.
        Then clear the current_bill attribute to make it ready for the next bill to be scraped.
        """

        bill_data = self.current_bill

        try:
            if self.current_bill.get('status') == "Draft":
                # save scraped draft bill to database
                bill = Bill.query.filter(Bill.name==bill_data['bill_name']).filter(Bill.year==bill_data['year']).first()
                if bill is None:
                    bill = Bill()
                    bill.name = bill_data['bill_name']
                    bill.year = bill_data['year']
                    self.stats['new_drafts'] += 1
                bill.bill_type = "Draft"
                db.session.add(bill)
                self.stats['total_drafts'] += 1

            else:
                # save scraped bills to database
                bill_code = self.current_bill["code"]
                bill = Bill.query.filter(Bill.code==bill_code).first()
                if bill is None:
                    bill = Bill()
                    bill.code = bill_code
                    self.stats['new_bills'] += 1
                bill.name = bill_data['bill_name']
                bill.year = bill_data['year']
                bill.number = bill_data['number']
                db.session.add(bill)
                self.stats['total_bills'] += 1

            # save related bill versions
            for entry_data in bill_data['versions']:
                entry = Entry.query.filter(Entry.url==entry_data['url']).first()  # Look for pre-existing entry.
                if entry is None:
                    entry = Entry()  # Create new entry.
                    self.stats['new_bill_versions'] += 1
                entry = scrapertools.populate_entry(entry, entry_data)
                entry.bills.append(bill)
                db.session.add(entry)
                self.stats['total_bill_versions'] += 1

        except Exception:
            error_msg = "Error saving bill: "
            if self.current_bill.get('bill_name'):
                error_msg += self.current_bill['bill_name']
            if self.current_bill.get('versions'):
                error_msg += " - " + self.current_bill['versions'][0]['title']
            logger.error(error_msg)
            self.stats['errors'].append(error_msg)

        logger.debug(json.dumps(self.current_bill, indent=4, default=scrapertools.handler))
        self.current_bill = {}
        return
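scrapertools.populate_entry is called above but never shown; a minimal sketch, assuming Entry exposes columns matching the scraped keys, might be:

def populate_entry(entry, data):
    # Plausible sketch only; the column names are assumptions inferred from
    # the keys built by the scraper ("url", "title", "date", "entry_type").
    entry.url = data.get("url")
    entry.title = data.get("title")
    entry.date = data.get("date")
    entry.type = data.get("entry_type")
    return entry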
    def run_scraper(self):

        # define agents
        na = Agent.query.filter(Agent.name == "National Assembly").first()
        ncop = Agent.query.filter(Agent.name == "National Council Of Provinces").first()

        while True:
            for date, title, href_hansard in self.next_hansard:
                logger.debug("\t\t" + str(date) + " - " + title)
                tmp_url = href_hansard
                html = scrapertools.URLFetcher(tmp_url, self.session).html
                soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
                content = soup.find(id="content")
                # find bills that are mentioned in the text
                bills = scrapertools.find_bills(str(content))
                # only save hansards that are related to bills
                if bills:
                    # infer location from title, where appropriate
                    location = None
                    agent = None
                    if title.startswith("NA:"):
                        location = 1
                        agent = na
                    elif title.startswith("NCOP:"):
                        location = 2
                        agent = ncop
                    self.current_hansard = {
                        "bills": bills,
                        "url": tmp_url,
                        "date": date,
                        "title": title,
                        "location": location
                    }
                    if agent:
                        self.current_hansard['agent'] = agent
                    try:
                        self.add_or_update()
                    except Exception:
                        msg = "Could not add hansard to database: "
                        if self.current_hansard.get("title"):
                            msg += self.current_hansard["title"]
                        self.stats["errors"].append(msg)
                        logger.error(msg)
                    self.current_hansard = {}

            # save hansards to database, once per scraped page
            db.session.commit()
            logger.debug(json.dumps(self.stats, indent=4))
            # test whether we have reached the end
            if not self.next_page:
                break
        return
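scrapertools.find_bills only has to return something truthy for a hansard or report to be kept; a plausible sketch, assuming bill references of the form "B 12-2013" or "B12B-2013" in the page text, might be:

import re

# Plausible sketch of scrapertools.find_bills; the real implementation is
# not shown in these snippets, and the code format is an assumption.
BILL_CODE = re.compile(r"\bB\s?(\d+)([A-F]?)-(\d{4})\b")

def find_bills(text):
    bills = []
    for match in BILL_CODE.finditer(text):
        number, revision, year = match.groups()
        bills.append({
            "code": "B" + number + revision + "-" + year,
            "number": int(number),
            "year": int(year),
        })
    return bills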
    def next_committee(self):
        html = scrapertools.URLFetcher("http://www.pmg.org.za/committees", self.session).html
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
        container = soup.find(id="committees-all")
        committee_lists = container.findAll("div", {"class": "item-list"})
        for committee_list in committee_lists:
            list_name = committee_list.find('h3').contents[0]
            logger.debug("\n" + list_name + ":")
            committees = committee_list.findAll('li')
            for committee in committees:
                href = "http://www.pmg.org.za" + committee.find('a')['href']
                name = committee.find('a').contents[0]
                logger.debug("\t" + name)
                yield list_name, href, name
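A consumer simply iterates the generator; an illustrative sketch of another method on the same (unshown) class:

    def list_committees(self):
        # Illustrative only: walk the committee listing and log each entry.
        for list_name, href, name in self.next_committee():
            logger.info(list_name + " | " + name + " | " + href)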
    def version_state(self, fragment):
        """
        Extract available versions from second row.
        """
        link = fragment.find("a")
        # test whether the row contains a link to a bill version
        if link and not ("bills.pmg.org.za" in link["href"] or "Bill Tracker" in link.text):
            versions = self.current_bill.setdefault("versions", [])
            if not self.current_bill.get("code"):
                tmp = link.text
                info = scrapertools.analyze_bill_code(tmp)
                if info:
                    # merge the parsed code fields into the current record
                    self.current_bill.update(info)
                else:
                    logger.error("No bill found in string: " + tmp)

            try:
                version = {
                    "url": link["href"],
                    "title": link.text,
                    "date": date_parser.parse(fragment.findAll("td")[1].text).date(),
                    "entry_type": "bill-version",
                    }
            except Exception:
                # log the offending row before re-raising
                logger.debug(str(fragment))
                raise

            # set entry_type appropriately if this bill has already been enacted
            if "as enacted" in link.text:
                version['entry_type'] = "act"
            versions.append(version)
            self.state_fn = self.version_state
            return True
        else:
            self.state_fn = self.header_state
            return False
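scrapertools.analyze_bill_code is assumed to turn a link caption containing a bill code into the fields merged above, or None when nothing parses; a plausible sketch:

import re

# Plausible sketch of scrapertools.analyze_bill_code; the real helper is
# not shown. It is assumed to return a dict of code fields, or None when
# the caption contains no recognisable bill code.
def analyze_bill_code(text):
    match = re.search(r"\[?(B\s?(\d+)[A-F]?-(\d{4}))\]?", text)
    if match is None:
        return None
    return {
        "code": match.group(1).replace(" ", ""),
        "number": int(match.group(2)),
        "year": int(match.group(3)),
    }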
    def scrape_committee(self):
        """
        Scrape all meeting reports for a particular committee.
        """

        for date, title, href_report in self.next_report:
            logger.debug("\t\t" + str(date) + " - " + (title[:45] if len(title) > 45 else title))
            tmp_url = href_report
            html = scrapertools.URLFetcher(tmp_url, self.session).html
            soup = BeautifulSoup(html)
            content = soup.find(id="content")
            bills = scrapertools.find_bills(str(content))
            # only save report entries that can be tagged to bills
            if bills:
                self.current_report = {
                    "entry_type": "committee-meeting",
                    "bills": bills,
                    "url": tmp_url,
                    "date": date,
                    "title": title,
                    "agent": self.current_committee,
                    }

                # report URL may have changed after editing on pmg.org.za, check for this
                possible_duplicates = Entry.query.filter(Entry.agent == self.current_committee)\
                    .filter(Entry.url != None)\
                    .filter(Entry.url != tmp_url)\
                    .filter(Entry.type == "committee-meeting")\
                    .filter(Entry.is_deleted == False)\
                    .filter(Entry.date == date)\
                    .order_by(Entry.entry_id).all()
                deletion_flag = False
                if possible_duplicates:
                    logger.debug(str(len(possible_duplicates)) + " possible duplicates found")
                    for possible_duplicate in possible_duplicates:
                        redirect_url = scrapertools.URLFetcher(possible_duplicate.url, self.session).follow_redirect()
                        if possible_duplicate.url != redirect_url:
                            logger.debug('redirect encountered')
                            if redirect_url == tmp_url:
                                logger.info("Updating entry URL")
                                # update the existing record's URL
                                possible_duplicate.url = tmp_url
                                # # delete all but one entry, if there are multiple duplicates
                                # if deletion_flag:
                                #     logger.info('duplicate entry deleted')
                                #     possible_duplicate.is_deleted = True
                                db.session.add(possible_duplicate)
                                db.session.commit()
                                deletion_flag = True

                if self.current_committee.location:
                    self.current_report["location"] = self.current_committee.location
                try:
                    self.add_or_update()
                except Exception as e:
                    msg = "Could not add committee report to database: "
                    if self.current_report.get("title"):
                        msg += self.current_report["title"]
                    self.stats["errors"].append(msg)
                    logger.error(msg)
                    logger.exception(str(e))
                self.current_report = {}
            else:
                logger.debug('no bills found in committee meeting report')
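URLFetcher.follow_redirect is what makes the duplicate check above work; a minimal sketch with requests, assuming the class wraps the shared session passed around in these snippets:

import requests

# Plausible sketch of URLFetcher.follow_redirect; the real class is not
# shown here. It is assumed to resolve the final URL a page redirects to,
# reusing the shared requests session.
class URLFetcher(object):
    def __init__(self, url, session):
        self.url = url
        self.session = session

    def follow_redirect(self):
        response = self.session.head(self.url, allow_redirects=True)
        return response.url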