Python UrlSource Examples, okscraper.sources.UrlSource Python Examples

Example #1

0

Show file

class LobbyistsIndexScraper(BaseScraper):
    """
    This scraper gets the list of lobbyist ids from the knesset lobbyists page html
    returns a list of lobbyist ids - doesn't store anything in db
    """
    LOBBYISTS_INDEX_PAGE_URL = 'http://www.knesset.gov.il/lobbyist/heb/lobbyist.aspx'

    def __init__(self):
        super(LobbyistsIndexScraper, self).__init__(self)
        self.source = UrlSource(self.LOBBYISTS_INDEX_PAGE_URL)
        self.storage = ListStorage()

    def _storeLobbyistIdsFromSoup(self, soup):
        elts = soup.findAll(lobbyist_id=True)
        counter = 0
        for elt in elts:
            lobbyist_id = elt.get('lobbyist_id')
            if lobbyist_id.isdigit():
                self.storage.store(lobbyist_id)
                self._getLogger().debug(lobbyist_id)
                counter = counter + 1
        self._getLogger().info('got %s lobbyists', str(counter))

    def _scrape(self):
        try:
            html = self.source.fetch()
            soup = BeautifulSoup(html)
        except Exception as e:
            send_chat_notification(
                __file__, 'failed to fetch or parse the lobbyists index page',
                {'url': self.LOBBYISTS_INDEX_PAGE_URL})
            raise e
        return self._storeLobbyistIdsFromSoup(soup)

Example #2

0

Show file

File: lobbyist.py Project: RobotnickIsrael/Open-Knesset

class LobbyistScraper(BaseScraper):
    """
    This scraper gets a lobbyist id, it then goes to the knesset api to get the data about the lobbyist
    """

    def __init__(self):
        super(LobbyistScraper, self).__init__()
        self.source = UrlSource('http://online.knesset.gov.il/WsinternetSps/KnessetDataService/LobbyistData.svc/View_lobbyist(<<id>>)')
        self.storage = LobbyistScraperDictStorage()

    def _storeLobbyistDataFromSoup(self, soup):
        lobbyist_id = soup.find('d:lobbyist_id').text.strip()
        self._getLogger().info('got lobbyist id "%s"', lobbyist_id)
        lobbyist = {
            'id': lobbyist_id,
            'first_name': soup.find('d:first_name').text.strip(),
            'family_name': soup.find('d:family_name').text.strip(),
            'profession': soup.find('d:profession').text.strip(),
            'corporation_name': soup.find('d:corporation_name').text.strip(),
            'corporation_id': soup.find('d:corporation_id').text.strip(),
            'faction_member': soup.find('d:faction_member').text.strip(),
            'faction_name': soup.find('d:faction_name').text.strip(),
            'permit_type': soup.find('d:lobyst_permit_type').text.strip(),
        }
        self.storage.storeDict(lobbyist)
        self._getLogger().debug(lobbyist)

    def _scrape(self, lobbyist_id):
        html = self.source.fetch(lobbyist_id)
        soup = BeautifulSoup(html)
        return self._storeLobbyistDataFromSoup(soup)

Example #3

0

Show file

File: lobbyists_index.py Project: MeirKriheli/Open-Knesset

class LobbyistsIndexScraper(BaseScraper):
    """
    This scraper gets the list of lobbyist ids from the knesset lobbyists page html
    returns a list of lobbyist ids - doesn't store anything in db
    """
    LOBBYISTS_INDEX_PAGE_URL = 'http://www.knesset.gov.il/lobbyist/heb/lobbyist.aspx'

    def __init__(self):
        super(LobbyistsIndexScraper, self).__init__(self)
        self.source = UrlSource(self.LOBBYISTS_INDEX_PAGE_URL)
        self.storage = ListStorage()

    def _storeLobbyistIdsFromSoup(self, soup):
        elts = soup.findAll(lobbyist_id=True)
        counter = 0
        for elt in elts:
            lobbyist_id = elt.get('lobbyist_id')
            if lobbyist_id.isdigit():
                self.storage.store(lobbyist_id)
                self._getLogger().debug(lobbyist_id)
                counter = counter + 1
        self._getLogger().info('got %s lobbyists', str(counter))

    def _scrape(self):
        try:
            html = self.source.fetch()
            soup = BeautifulSoup(html)
        except Exception as e:
            send_chat_notification(__file__, 'failed to fetch or parse the lobbyists index page', {'url': self.LOBBYISTS_INDEX_PAGE_URL})
            raise e
        return self._storeLobbyistIdsFromSoup(soup)

Example #4

0

Show file

File: lobbyists_index.py Project: JoeyHa/Open-Knesset

class LobbyistsIndexScraper(BaseScraper):
    """
    This scraper gets the list of lobbyist ids from the knesset lobbyists page html
    returns a list of lobbyist ids - doesn't store anything in db
    """

    def __init__(self):
        super(LobbyistsIndexScraper, self).__init__(self)
        self.source = UrlSource('http://www.knesset.gov.il/lobbyist/heb/lobbyist.aspx')
        self.storage = ListStorage()

    def _storeLobbyistIdsFromSoup(self, soup):
        elts = soup.findAll(lobbyist_id=True)
        counter = 0
        for elt in elts:
            lobbyist_id = elt.get('lobbyist_id')
            if lobbyist_id.isdigit():
                self.storage.store(lobbyist_id)
                self._getLogger().debug(lobbyist_id)
                counter = counter + 1
        self._getLogger().info('got %s lobbyists', str(counter))

    def _scrape(self):
        html = self.source.fetch()
        soup = BeautifulSoup(html)
        return self._storeLobbyistIdsFromSoup(soup)

Example #5

0

Show file

class LobbyistScraper(BaseScraper):
    """
    This scraper gets a lobbyist id, it then goes to the knesset api to get the data about the lobbyist
    """
    def __init__(self):
        super(LobbyistScraper, self).__init__()
        self.source = UrlSource(
            'http://online.knesset.gov.il/WsinternetSps/KnessetDataService/LobbyistData.svc/View_lobbyist(<<id>>)'
        )
        self.storage = LobbyistScraperDictStorage()

    def _storeLobbyistDataFromSoup(self, soup):
        lobbyist_id = soup.find('d:lobbyist_id').text.strip()
        self._getLogger().info('got lobbyist id "%s"', lobbyist_id)
        lobbyist = {
            'id': lobbyist_id,
            'first_name': soup.find('d:first_name').text.strip(),
            'family_name': soup.find('d:family_name').text.strip(),
            'profession': soup.find('d:profession').text.strip(),
            'corporation_name': soup.find('d:corporation_name').text.strip(),
            'corporation_id': soup.find('d:corporation_id').text.strip(),
            'faction_member': soup.find('d:faction_member').text.strip(),
            'faction_name': soup.find('d:faction_name').text.strip(),
            'permit_type': soup.find('d:lobyst_permit_type').text.strip(),
        }
        self.storage.storeDict(lobbyist)
        self._getLogger().debug(lobbyist)

    def _scrape(self, lobbyist_id):
        html = self.source.fetch(lobbyist_id)
        soup = BeautifulSoup(html)
        return self._storeLobbyistDataFromSoup(soup)

Example #6

0

Show file

class LobbyistsIndexScraper(BaseScraper):
    """
    This scraper gets the list of lobbyist ids from the knesset lobbyists page html
    returns a list of lobbyist ids - doesn't store anything in db
    """
    def __init__(self):
        super(LobbyistsIndexScraper, self).__init__(self)
        self.source = UrlSource(
            'http://www.knesset.gov.il/lobbyist/heb/lobbyist.aspx')
        self.storage = ListStorage()

    def _storeLobbyistIdsFromSoup(self, soup):
        elts = soup.findAll(lobbyist_id=True)
        counter = 0
        for elt in elts:
            lobbyist_id = elt.get('lobbyist_id')
            if lobbyist_id.isdigit():
                self.storage.store(lobbyist_id)
                self._getLogger().debug(lobbyist_id)
                counter = counter + 1
        self._getLogger().info('got %s lobbyists', str(counter))

    def _scrape(self):
        html = self.source.fetch()
        soup = BeautifulSoup(html)
        return self._storeLobbyistIdsFromSoup(soup)

Example #7

0

Show file

class LobbyistRepresentScraper(BaseScraper):
    """
    This scraper gets a lobbyist id and returns a list of LobbyistRepresent objects for that lobbyist 
    """
    def __init__(self):
        super(LobbyistRepresentScraper, self).__init__(self)
        self.source = UrlSource(
            'http://online.knesset.gov.il/WsinternetSps/KnessetDataService/LobbyistData.svc/View_lobbyist(<<id>>)/lobbist_type'
        )
        self.storage = LobbyistRepresentListStorage()

    def _storeLobbyistRepresentDataFromSoup(self, soup, lobbyist_id):
        self._getLogger().info('got lobbyist represent (lobbyist id "%s")',
                               lobbyist_id)
        for elt in soup.findAll('content'):
            represent = {}
            represent['id'] = elt.find('d:lobbyist_represent_id').text.strip()
            represent['lobbyist_id'] = elt.find('d:lobbyist_id').text.strip()
            represent['name'] = elt.find(
                'd:lobbyist_represent_name').text.strip()
            represent['domain'] = elt.find(
                'd:lobbyist_represent_domain').text.strip()
            represent['type'] = elt.find(
                'd:lobbyist_represent_type').text.strip()
            self._getLogger().debug(represent)
            self.storage.store(represent)

    def _scrape(self, lobbyist_id):
        html = self.source.fetch(lobbyist_id)
        soup = BeautifulSoup(html)
        return self._storeLobbyistRepresentDataFromSoup(soup, lobbyist_id)

Example #8

0

Show file

File: lobbyist_represent.py Project: JoeyHa/Open-Knesset

class LobbyistRepresentScraper(BaseScraper):
    """
    This scraper gets a lobbyist id and returns a list of LobbyistRepresent objects for that lobbyist 
    """

    def __init__(self):
        super(LobbyistRepresentScraper, self).__init__(self)
        self.source = UrlSource('http://online.knesset.gov.il/WsinternetSps/KnessetDataService/LobbyistData.svc/View_lobbyist(<<id>>)/lobbist_type')
        self.storage = LobbyistRepresentListStorage()

    def _storeLobbyistRepresentDataFromSoup(self, soup, lobbyist_id):
        self._getLogger().info('got lobbyist represent (lobbyist id "%s")', lobbyist_id)
        for elt in soup.findAll('content'):
            represent = {}
            represent['id'] = elt.find('d:lobbyist_represent_id').text.strip()
            represent['lobbyist_id'] = elt.find('d:lobbyist_id').text.strip()
            represent['name'] = elt.find('d:lobbyist_represent_name').text.strip()
            represent['domain'] = elt.find('d:lobbyist_represent_domain').text.strip()
            represent['type'] = elt.find('d:lobbyist_represent_type').text.strip()
            self._getLogger().debug(represent)
            self.storage.store(represent)

    def _scrape(self, lobbyist_id):
        html = self.source.fetch(lobbyist_id)
        soup = BeautifulSoup(html)
        return self._storeLobbyistRepresentDataFromSoup(soup, lobbyist_id)

Example #9

0

Show file

File: lobbyist.py Project: RobotnickIsrael/Open-Knesset

 def __init__(self):
     super(LobbyistScraper, self).__init__()
     self.source = UrlSource('http://online.knesset.gov.il/WsinternetSps/KnessetDataService/LobbyistData.svc/View_lobbyist(<<id>>)')
     self.storage = LobbyistScraperDictStorage()

Example #10

0

Show file

File: lobbyists_index.py Project: MeirKriheli/Open-Knesset

 def __init__(self):
     super(LobbyistsIndexScraper, self).__init__(self)
     self.source = UrlSource(self.LOBBYISTS_INDEX_PAGE_URL)
     self.storage = ListStorage()

Example #11

0

Show file

File: lobbyists_index.py Project: JoeyHa/Open-Knesset

 def __init__(self):
     super(LobbyistsIndexScraper, self).__init__(self)
     self.source = UrlSource('http://www.knesset.gov.il/lobbyist/heb/lobbyist.aspx')
     self.storage = ListStorage()

Example #12

0

Show file

 def __init__(self):
     super(LobbyistScraper, self).__init__()
     self.source = UrlSource(
         'http://online.knesset.gov.il/WsinternetSps/KnessetDataService/LobbyistData.svc/View_lobbyist(<<id>>)'
     )
     self.storage = LobbyistScraperDictStorage()

Example #13

0

Show file

 def __init__(self):
     super(LobbyistsIndexScraper, self).__init__(self)
     self.source = UrlSource(self.LOBBYISTS_INDEX_PAGE_URL)
     self.storage = ListStorage()

Example #14

0

Show file

 def __init__(self):
     super(LobbyistsIndexScraper, self).__init__(self)
     self.source = UrlSource(
         'http://www.knesset.gov.il/lobbyist/heb/lobbyist.aspx')
     self.storage = ListStorage()