Example #1
0
    def scrape(self):
        """Yield a Person for every non-empty cell of the council table.

        The first cell of the table is recorded as the 'Maire' of Kirkland;
        every other cell is a 'Conseiller' whose district is read from the
        cell's ``<h2>`` heading.
        """
        page = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1')

        cells = page.xpath('//div[@id="PageContent"]/table/tbody/tr/td')
        assert len(cells), 'No councillors found'
        first_cell = cells[0]
        for cell in cells:
            # Skip cells without any text.
            if not cell.text_content().strip():
                continue

            if cell == first_cell:
                district = 'Kirkland'
                role = 'Maire'
            else:
                heading = cell.xpath('.//h2')[0].text_content()
                # Keep only what follows the "- " separator in the heading.
                district = re.search('- (.+)', heading).group(1).strip()
                district = district.replace(' Ouest', ' ouest')
                district = district.replace(' Est', ' est')
                role = 'Conseiller'

            name = cell.xpath('.//strong/text()')[0]

            # Phone text is normalised step by step: drop the "T " prefix,
            # turn spaces into dashes, then rewrite ",-#-" as an extension.
            raw_phone = cell.xpath('.//div[contains(text(), "#")]/text()')[0]
            phone = raw_phone.replace('T ', '')
            phone = phone.replace(' ', '-')
            phone = phone.replace(',-#-', ' x')
            email = self.get_email(cell)

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)
            person.add_contact('voice', phone, 'legislature')
            person.add_contact('email', email)
            person.image = cell.xpath('.//img/@src')[0]
            yield person
Example #2
0
    def scrape(self):
        """Yield a Person for each table row of the council listing.

        A row whose first-column label contains 'Maire' becomes the mayor of
        Senneville; any other row is a councillor whose district is that
        label with 'no. ' removed.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        rows = page.xpath('//div[@class="field-item even"]//tr')
        assert len(rows), 'No councillors found'
        for row in rows:
            label = row.xpath('./td[1]//strong/text()')[0]
            district = label.replace('no. ', '')
            if 'Maire' in district:
                district = 'Senneville'
                role = 'Maire'
            else:
                role = 'Conseiller'

            raw_name = row.xpath('./td[2]//p//text()')[0]
            name = raw_name.title()
            email = self.get_email(row)

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)

            # The photo is optional: rows without an <img> simply get no image.
            photos = row.xpath('.//img/@src')
            if photos:
                person.image = photos[0]

            person.add_contact('email', email)
            yield person
Example #3
0
    def scrape(self):
        """Yield an 'MHA' Person for every member link found on the index page.

        Member detail URLs are pulled out of the raw index page text with a
        regular expression; each detail page supplies the name, district,
        party, photo, email and any phone numbers.
        """
        self.user_agent = CUSTOM_USER_AGENT
        index = self.get(COUNCIL_PAGE)
        member_paths = re.findall('/Members/YourMember/[^"]+', index.text)
        assert len(member_paths), 'No members found'

        for member_path in member_paths:
            detail_url = 'http://www.assembly.nl.ca%s' % member_path
            detail = self.lxmlize(detail_url, user_agent=CUSTOM_USER_AGENT)

            name = detail.xpath('//h1/text()')[0]
            # A space-delimited non-breaking space, en dash or hyphen becomes
            # a single em dash.
            raw_district = detail.xpath('//h2/text()')[0]
            district = re.sub(r' [\xa0–-] ', '—', raw_district)
            party_label = detail.xpath('//h3/text()')[0]
            party = PARTIES[party_label]

            person = Person(primary_org='legislature', name=name, district=district, role='MHA', party=party)
            person.image = detail.xpath('//img[@class="img-responsive"]/@src')[0]

            contact_block = detail.xpath('//div[@class="col-md-12"]')[0]
            person.add_contact('email', self.get_email(contact_block))

            person.add_source(COUNCIL_PAGE)
            person.add_source(detail_url)

            # One optional phone number per heading listed in HEADING_TYPE.
            for heading, contact_type in HEADING_TYPE.items():
                matches = detail.xpath('//b[.="%s"]/../..' % heading)
                if not matches:
                    continue
                phone = self.get_phone(matches[0], error=False)
                if phone:
                    person.add_contact('voice', phone, contact_type)

            yield person
Example #4
0
    def scrape(self):
        """Yield a Person for each member page linked from the council page.

        The email is taken once from CONTACT_URL and attached to every
        person. Anyone whose role is not 'Mayor' gets a numbered
        'Richmond (seat N)' district, in page order.
        """
        next_seat = 1

        shared_email = self.get_email(self.lxmlize(CONTACT_URL))

        index = self.lxmlize(COUNCIL_PAGE)
        member_urls = index.xpath('//a/@href[contains(., "members/")]')
        assert len(member_urls), 'No councillors found'

        for member_url in member_urls:
            member_page = self.lxmlize(member_url)
            # The heading reads "<role> <name>"; split on the first space only.
            heading = member_page.xpath('//h1//text()')[0]
            role, name = heading.split(' ', 1)
            photo_url = member_page.xpath('//div[@id="content"]//img/@src')[0]

            if role != 'Mayor':
                district = 'Richmond (seat {})'.format(next_seat)
                next_seat += 1
            else:
                district = 'Richmond'

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.image = photo_url
            for source in (COUNCIL_PAGE, CONTACT_URL, member_url):
                person.add_source(source)
            person.add_contact('email', shared_email)  # same for all
            yield person
Example #5
0
    def scrape(self):
        """Yield the mayor, then up to two councillors for every ward heading.

        The mayor is built by ``self.scrape_mayor`` from the first ``<p>``
        that follows the "MAYOR" ``<h2>``. Every ``<h3>`` is treated as a
        ward heading whose "WARD <n> - " prefix is stripped to obtain the
        district name.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        mayor_info = page.xpath('//h2[contains(text(), "MAYOR")]//following-sibling::p')[0]
        yield self.scrape_mayor(mayor_info)

        for ward in page.xpath('//h3'):
            # Raw string: \A and \d are regex escapes, not string escapes.
            district = re.sub(r'\AWARD \d+ - ', '', ward.text_content())

            # following-sibling::p matches every later sibling <p>, so only
            # the first two are used. Slicing replaces the old
            # "councillor == councillors[1]" check, which raised IndexError
            # when a heading was followed by a single <p>.
            for councillor in ward.xpath('following-sibling::p')[:2]:
                name = councillor.xpath('./strong')[0].text_content()

                p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
                p.add_source(COUNCIL_PAGE)

                # The first bare text node is the address; the rest are
                # handed to get_tel_numbers line by line.
                info = councillor.xpath('./text()')
                address = info.pop(0)
                p.add_contact('address', address, 'legislature')

                # get phone numbers
                for line in info:
                    stuff = re.split(r'(\xbb)|(\xa0)', line)
                    # Drop empty/None split results and non-breaking-space tokens.
                    tmp = [y for y in stuff if y and not re.match(r'\xa0', y)]
                    self.get_tel_numbers(tmp, p)

                email = self.get_email(councillor)
                p.add_contact('email', email)

                yield p
Example #6
0
    def scrape(self):
        """Yield a Person for each non-vacant entry in the council listing.

        Entries labelled with a ward become 'Councillor' for that ward,
        'At Large' entries become 'Councillor at Large' with a numbered
        seat, and anything else keeps its label as the role with the
        district "St. John's".
        """
        next_seat = 1

        page = self.lxmlize(COUNCIL_PAGE)
        for entry in page.xpath('//div[@class="view-content"]/div'):
            fields = entry.xpath('./div')
            role = fields[0].xpath('./div//text()')[0]

            # The link text is title-cased and anything up to and including
            # the role label is cut off, leaving the name.
            link_text = fields[2].xpath('.//a//text()')[0]
            name = link_text.title().split(role)[-1].strip()
            if name == 'Vacant':
                continue

            if 'Ward' in role:
                district = role
                role = 'Councillor'
            elif 'At Large' in role:
                role = 'Councillor at Large'
                district = "St. John's (seat {})".format(next_seat)
                next_seat += 1
            else:
                district = "St. John's"

            phone = fields[3].xpath('./div//text()')[0]
            email = self.get_email(fields[5])
            photo_url = entry.xpath('.//img/@src')[0]

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)
            person.add_contact('voice', phone, 'legislature')
            person.add_contact('email', email)
            person.image = photo_url
            yield person
Example #7
0
    def scrape(self):
        """Yield a Person for each two-cell first row of the page's tables.

        The last matching row is ignored. A row with exactly three text
        fragments is the mayor; otherwise the first fragment carries the
        district. Name, phone and email are always the last three fragments.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        two_cell_rows = [row for row in page.xpath('//table//tr[1]') if len(row) == 2]
        for row in two_cell_rows[:-1]:
            stripped = (text.strip() for text in row.xpath('.//text()[normalize-space()]'))
            desc = [fragment for fragment in stripped if fragment]

            if len(desc) != 3:
                role = 'Conseiller'
                district = desc[0].replace('numéro ', '')
            else:
                role = 'Maire'
                district = 'Saint-Jérôme'

            name = desc[-3]
            phone = desc[-2]
            email = desc[-1]

            image = row.xpath('.//img/@src')[0]

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)
            person.image = image
            person.add_contact('voice', phone, 'legislature')
            person.add_contact('email', email)
            yield person
Example #8
0
    def scrape(self):
        """Yield a Person for each 'councillorwrapper' block on the page.

        The first block is treated as the mayor when it has no district
        text; only the mayor receives an email contact.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        wrappers = page.xpath('//div[contains(@class, "councillorwrapper")]')
        assert len(wrappers), 'No councillors found'
        for position, wrapper in enumerate(wrappers):
            name = wrapper.xpath('.//h4/text()')[0]
            district = wrapper.xpath('.//h4/span/text()')[0].strip()

            if position == 0 and not district:
                district = 'Calgary'
                role = 'Mayor'
                email = '*****@*****.**'
            else:
                role = 'Councillor'
                email = None

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.image = wrapper.xpath('.//@src')[0]
            if email:
                person.add_contact('email', email)
            person.add_source(COUNCIL_PAGE)
            yield person
Example #9
0
    def scrape(self):
        """Yield a Person for each row of the council table.

        For the mayor the name shares a line with the title; for everyone
        else the first line is "<role> Ward <n>" and the second is the name.
        Contact numbers are read from "<type>: <number>" lines.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        rows = page.xpath('//table[@id="Table1table"]/tbody/tr')
        assert len(rows), 'No councillors found'
        for row in rows:
            role_district = row.xpath('./td[2]/p/text()')[0].strip()

            if 'Mayor' not in role_district:
                name = row.xpath('./td[2]/p/text()')[1]
                # Split just before "Ward" so the district keeps that word.
                role, district = re.split(r' (?=Ward)', role_district)
                if role == 'Town and Regional Councillor':
                    role = 'Regional Councillor'
                elif role == 'Town Councillor':
                    role = 'Councillor'
            else:
                name = role_district.replace('Mayor and Regional Councillor', '')
                role = 'Mayor'
                district = 'Milton'

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)

            person.image = row.xpath('./td[1]/p//img/@src')[0]

            for entry in row.xpath('./td[3]/p[2]/text()'):
                num_type, number = entry.split(':')
                number = number.replace(', ext ', ' x').strip()
                # The label before the colon is used both as contact type and note.
                person.add_contact(num_type, number, num_type)

            yield person
Example #10
0
    def scrape(self):
        """Yield an 'MLA' Person for each member cell on the listing page.

        Each member's detail page provides the photo, email and, when a
        "P:"/"Phone:" number is present in the first paragraph, a phone.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        # Riding names as shown on the page -> names used for districts.
        corrections = {
            'Mackenzie Delta': 'Mackenzie-Delta',
            'Tu Nedhe - Wiilideh': 'Tu Nedhe',
        }

        cells = page.xpath('//div[@class="views-field views-field-field-picture"]/parent::td')
        for cell in cells:
            picture_field, name_field, riding_field = cell[0], cell[1], cell[2]

            name = name_field.text_content().replace(' .', '. ')  # typo on page
            riding = riding_field.text_content().strip()
            if riding in corrections:
                riding = corrections[riding]

            detail_url = picture_field.xpath('.//a/@href')[0]
            detail_page = self.lxmlize(detail_url)
            photo_url = detail_page.xpath('//div[@class="field-item even"]/img/@src')[0]
            email = self.get_email(detail_page)

            text_nodes = detail_page.xpath('//div[@property="content:encoded"]/p[1]//text()')
            phone_match = re.search(r'P(hone)?: ([-0-9]+)', ''.join(text_nodes))

            person = Person(primary_org='legislature', name=name, district=riding, role='MLA', image=photo_url)
            person.add_source(COUNCIL_PAGE)
            person.add_source(detail_url)
            person.add_contact('email', email)
            if phone_match:
                person.add_contact('voice', phone_match.group(2), 'legislature')
            yield person
Example #11
0
    def scrape(self):
        """Yield the mayor and every non-vacant councillor from the page script.

        District names, member names and profile links are extracted with
        regular expressions from the second ``<script>`` of the inner
        container. The first name is the mayor; each remaining entry is a
        councillor whose photo and email come from a profile page.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        # it's all javascript rendered on the client... wow.
        js = page.xpath('string(//div[@class="inner_container"]/div/script[2])')  # allow string()
        districts = re.findall(r'arrayDistricts\[a.+"(.+)"', js)
        names = re.findall(r'arrayMembres\[a.+"(.+)"', js)
        urls = re.findall(r'arrayLiens\[a.+"(.+)"', js)

        # first item in list is mayor
        p = Person(primary_org='legislature', name=names[0], district='Gatineau', role='Maire')
        p.add_source(COUNCIL_PAGE)
        p.add_source(MAYOR_CONTACT_PAGE)
        email = '*****@*****.**'  # hardcoded
        p.add_contact('email', email)
        yield p

        # Skip the first (mayor) entry of the three parallel lists.
        for raw_district, name, url in list(zip(districts, names, urls))[1:]:
            if name == 'Vacant':
                continue

            # Only the last path segment of the link is appended to COUNCIL_PAGE.
            profile_url = COUNCIL_PAGE + '/' + url.split('/')[-1]
            profile_page = self.lxmlize(profile_url)
            photo_url = profile_page.xpath('//img/@src')[0]
            # Raw string: \d is a regex escape, not a valid string escape.
            district = 'District ' + re.search(r'\d+', raw_district).group(0)
            email = self.get_email(profile_page)
            p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.add_source(profile_url)
            p.image = photo_url
            p.add_contact('email', email)
            yield p
Example #12
0
    def scrape(self):
        """Yield an 'MLA' Person for each non-empty row of the first table.

        Email, phone and photo come from ``self.scrape_extended_info`` and
        are attached only when that call returns something truthy.
        """
        page = self.lxmlize(COUNCIL_PAGE)
        rows = page.xpath('//table[1]//tr')

        assert len(rows), 'No members found'
        for row in rows:
            if not row.text_content().strip():
                continue

            name = row.xpath('./td[2]//a[1]//text()')[0]

            # The "MLA" link text looks like "<label>: <part> - <part>"; the
            # two parts are rejoined with a bare hyphen.
            mla_text = row.xpath('./td[2]//a[contains(.//text(), "MLA")]//text()')[0]
            district_parts = mla_text.split(':')[1].replace('St ', 'St. ').split('-')
            district = '-'.join((district_parts[0].strip(), district_parts[1].strip()))

            url = row.xpath('./td[2]//a[1]/@href')[0]
            extended = self.scrape_extended_info(url)

            person = Person(primary_org='legislature', name=name, district=district, role='MLA')
            person.add_source(COUNCIL_PAGE)
            person.add_source(url)

            if extended:  # member pages might return errors
                email, phone, photo_url = extended
                person.image = photo_url
                if email:
                    person.add_contact('email', email)
                if phone:
                    person.add_contact('voice', phone, 'legislature')
            yield person
Example #13
0
    def scrape(self):
        """Yield a Person for each non-vacant entry in the council table.

        An entry with fewer than three ``<strong>`` fragments is the mayor
        of Dorval; otherwise the third fragment is the district.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        entries = page.xpath('//div[@id="large_content"]//td/p[2]')
        assert len(entries), 'No councillors found'
        for entry in entries:
            info = entry.xpath('./strong/text()')

            # In case the name spans on 2 lines: merge the first two
            # fragments and split the third one into role and district.
            name_is_split = len(info) > 2 and 'Councillor' not in info[1]
            if name_is_split:
                role, district = info[2].split('-')
                info = [info[0] + info[1], role, district]

            name = info[0]
            if 'Vacant' in info:
                continue

            if len(info) >= 3:
                district = info[2]
                role = 'Conseiller'
            else:
                district = 'Dorval'
                role = 'Maire'

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)

            person.image = entry.xpath('./preceding-sibling::p/img/@src')[0]

            person.add_contact('email', self.get_email(entry))

            yield person
Example #14
0
    def scrape(self):
        """Yield the mayor of Pointe-Claire, then one Person per councillor cell.

        The mayor comes from the second ``<p>`` of the first table inside the
        "item-page clearfix" container. Councillors are read from every
        odd-indexed ``<tr>`` of the page; the district and phone for the cell
        in column ``j`` are taken from column ``j`` of the row that follows.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        mayor = page.xpath('.//div[@class="item-page clearfix"]//table[1]//p')[1]
        name = mayor.xpath('.//strong/text()')[0]

        p = Person(primary_org='legislature', name=name, district='Pointe-Claire', role='Maire')
        p.add_source(COUNCIL_PAGE)

        # First "NNN NNN-NNNN" / "NNN-NNN-NNNN" match, normalised to dashes.
        phone = re.findall(r'[0-9]{3}[ -][0-9]{3}-[0-9]{4}', mayor.text_content())[0].replace(' ', '-')
        p.add_contact('voice', phone, 'legislature')
        yield p

        rows = page.xpath('//tr')
        for i, row in enumerate(rows):
            # Only odd-indexed rows are processed as councillor rows.
            if i % 2 == 0:
                continue
            councillors = row.xpath('./td')
            for j, councillor in enumerate(councillors):
                name = councillor.text_content()
                # rows[i + 1].xpath('.//td//a[contains(@href, "maps")]/text()')[j] # district number
                # The district text sits in the next row, at the same column index.
                district = rows[i + 1].xpath('.//td/p[1]/text()')[j].replace(' / ', '/')

                p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
                p.add_source(COUNCIL_PAGE)
                p.image = councillor.xpath('.//img/@src')[0]

                # Phone is also read from the matching cell of the next row.
                phone = re.findall(r'[0-9]{3}[ -][0-9]{3}-[0-9]{4}', rows[i + 1].xpath('.//td')[j].text_content())[0].replace(' ', '-')

                p.add_contact('voice', phone, 'legislature')

                yield p
Example #15
0
    def scrape(self):
        """Yield a Person for each 'member-container' block on the page.

        A block whose position text contains 'Maire' is the mayor of
        Westmount; otherwise the district is the first text node of the
        block's 'entry-content' div.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        for container in page.xpath('//div[@class="member-container"]'):
            name = container.xpath('.//h3')[0].text_content()
            position = container.xpath('.//div[@class="member-position"]')[0].text_content()

            if 'Maire' not in position:
                role = 'Conseiller'
                district = container.xpath('.//div[@class="entry-content"]/text()')[0]
            else:
                role = 'Maire'
                district = 'Westmount'

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)

            # The image is the target of the "Photo pour la presse" link.
            person.image = container.xpath('.//a[@title="Photo pour la presse"]/@href')[0]
            person.add_contact('email', self.get_email(container))

            yield person
Example #16
0
    def scrape(self):
        """Yield every ward councillor, then the mayor of Winnipeg.

        Councillors are delegated to ``self.councillor_data`` with their
        profile URL, name and normalised ward name; the mayor is built
        inline with a hardcoded email.
        """
        page = self.lxmlize(COUNCIL_PAGE, 'utf-8')
        cells = page.xpath('//td[@width="105"]')
        assert len(cells), 'No councillors found'
        for cell in cells:
            url = urljoin(COUNCIL_PAGE, cell.xpath('.//a/@href')[0])

            link_text = cell.xpath('.//a//text()')[0]
            ward = re.search('([A-Z].+) Ward', link_text).group(1)
            ward = ward.replace(' – ', '—')  # spaced en dash -> em dash
            ward = ward.replace(' - ', '—')  # spaced hyphen -> em dash
            ward = ward.replace('St. Norbert', 'St Norbert')  # to match ocd-division-ids

            name_parts = cell.xpath('.//span[@class="k80B"][1]/text()')
            name = ' '.join(name_parts)
            yield self.councillor_data(url, name, ward)

        mayor_cell = page.xpath('//td[@width="315"]')[0]
        # Drop the leading "Mayor " title from the link text.
        mayor_label = mayor_cell.xpath('./a//text()')[0]
        mayor_name = mayor_label[len('Mayor '):]
        mayor_photo_url = mayor_cell.xpath('./img/@src')[0]

        mayor = Person(primary_org='legislature', name=mayor_name, district='Winnipeg', role='Mayor')
        mayor.add_source(COUNCIL_PAGE)
        # @see http://www.winnipeg.ca/interhom/mayor/MayorForm.asp?Recipient=CLK-MayorWebMail
        mayor.add_contact('email', '*****@*****.**')  # hardcoded
        mayor.image = mayor_photo_url
        yield mayor
Example #17
0
    def scrape(self):
        """Yield a Person for each ' inner_member' block on the page.

        A block whose district text contains 'Maire' is the mayor of
        Saint-Jérôme; otherwise that text, minus 'numéro ', is the district.
        """
        page = self.lxmlize(COUNCIL_PAGE, encoding='utf-8')
        members = page.xpath('//div[contains(@class," inner_member")]')
        assert len(members), 'No councillors found'

        for member in members:
            name = member.xpath('.//h2/text()')[0]
            district_text = member.xpath('.//div[contains(@class,"district")]/text()')[0]
            district = district_text.replace('numéro ', '')

            role = 'Conseiller'
            if 'Maire' in district:
                district = 'Saint-Jérôme'
                role = 'Maire'

            # The portrait URL is stored in the lazy-loading attribute.
            image = member.xpath('.//div[@class="portrait_single"]/img/@data-lazy-src')[0]
            phone = member.xpath('.//div[contains(@class,"phone")]/text()')[0]

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)
            person.image = image

            person.add_contact('voice', phone, 'legislature')
            person.add_contact('email', self.get_email(member))

            yield person
Example #18
0
    def scrape(self):
        """Yield every ward councillor, then the mayor of Calgary.

        The first captioned block is skipped; the rest are delegated to
        ``self.councillor_data``. The mayor's phone is read from MAYOR_PAGE.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        captioned = page.xpath('//div[contains(@class,"cocis-has-caption")]')
        for block in captioned[1:]:
            url = urljoin(COUNCIL_PAGE, block.xpath('.//a[1]/@href')[0])
            name = block.xpath('.//a//text()')[0]
            # The ward is the caption text without its last word.
            caption_words = block.xpath('.//strong//text()')[0].split()
            ward = ' '.join(caption_words[:-1])
            yield self.councillor_data(url, name, ward)

        mayor_panel = page.xpath('//div[contains(@class, "cocis-image-panel")]')[0]
        photo_url = urljoin(COUNCIL_PAGE, mayor_panel.xpath('.//img/@src')[0])
        name = mayor_panel.xpath('.//a//text()')[0]
        mayor_page = self.lxmlize(MAYOR_PAGE)
        # No email is collected for the mayor: it is behind mailhide.
        phone = self.get_phone(mayor_page, area_codes=[403])

        mayor = Person(primary_org='legislature', name=name, district='Calgary', role='Mayor')
        mayor.add_source(COUNCIL_PAGE)
        mayor.add_source(MAYOR_PAGE)
        mayor.add_contact('voice', phone, 'legislature')
        mayor.image = photo_url
        yield mayor
Example #19
0
    def scrape(self):
        """Yield the mayor, then a Person for each block of the second 'img_four' group.

        Each councillor's bold label reads "<name>, <position> <district>";
        the district is passed through ``post_number`` before use.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        mayor_elem = page.xpath('//div[@class="img_four"][1]/div[1]')[0]
        yield self.scrape_mayor(mayor_elem)

        for elem in page.xpath('//div[@class="img_four"][2]/div'):
            label = elem.xpath('string(./p/strong)')  # allow string()
            name, position = label.split(',')
            # The first word is the role; the remainder identifies the district.
            position, district = position.strip().split(' ', 1)
            district = post_number(district)

            address_lines = (line.strip() for line in elem.xpath('./p/text()'))
            addr = '\n'.join(address_lines).strip()
            phone = elem.xpath('.//a[starts-with(@href, "tel:")]//text()')[0]
            image = elem.xpath('.//img[1]/@src')[0]

            person = Person(primary_org='legislature', name=name, district=district, role=position, image=image)
            person.add_source(COUNCIL_PAGE)
            person.add_contact('address', addr, 'legislature')
            person.add_contact('voice', phone, 'legislature')
            yield person
Example #20
0
    def scrape(self):
        """Yield the mayor of Trois-Rivières, then one Person per district page.

        The mayor comes from MAYOR_URL (no email is collected). Councillor
        districts and relative URLs are extracted with a regular expression
        from the raw COUNCIL_PAGE response; a district page without any
        ``<h2>`` text is skipped.
        """
        # mayor first, can't find email
        page = self.lxmlize(MAYOR_URL)
        photo_url = page.xpath('//img/@src[contains(., "maire")]')[0]
        name = page.xpath('//td[@class="contenu"]/text()[last()]')[0]
        p = Person(primary_org='legislature', name=name, district="Trois-Rivières", role="Maire",
                   image=photo_url)
        p.add_source(MAYOR_URL)
        yield p

        resp = self.get(COUNCIL_PAGE)
        # page rendering through JS on the client
        page_re = re.compile(r'createItemNiv3.+"District (.+?)".+(index.+)\\"')
        for district, url_rel in page_re.findall(resp.text):
            # Strip a leading "de ", "de la ", "des " or "du " except for the
            # four districts listed here, which keep it.
            if district not in ('des Estacades', 'des Plateaux', 'des Terrasses', 'du Sanctuaire'):
                # Raw string: \A is a regex anchor, not a valid string escape.
                district = re.sub(r'\A(?:de(?: la)?|des|du) ', '', district)

            url = urljoin(COUNCIL_PAGE, url_rel)
            page = self.lxmlize(url)

            name_content = page.xpath('//h2//text()')
            if name_content:
                name = name_content[0]
                email = self.get_email(page)
                photo_url = page.xpath('//img/@src[contains(., "Conseiller")]')[0]
                p = Person(primary_org='legislature', name=name, district=district, role='Conseiller',
                           image=photo_url)
                p.add_source(url)
                p.add_contact('email', email)
                yield p
Example #21
0
    def scrape(self):
        """Yield the mayor (when available) and every non-vacant councillor.

        The mayor and councillor pages are reached through links on
        COUNCIL_PAGE. Councillors are the table rows containing an image,
        except the last such row.
        """
        index = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT)

        mayor_url = index.xpath('//a[contains(text(), "Mayor")]/@href')[0]
        mayor = self.scrape_mayor(mayor_url)
        if mayor:
            yield mayor

        councillors_url = index.xpath('//a[contains(text(), "Councillors")]/@href')[0]
        councillors_page = self.lxmlize(councillors_url, user_agent=CUSTOM_USER_AGENT)

        rows = councillors_page.xpath('//tr[td//img]')[:-1]

        assert len(rows), 'No councillors found'
        for row in rows:
            # Each row must have exactly two children: photo cell, info cell.
            img_cell, info_cell = row
            if info_cell.xpath('.//p//text()[contains(., "Vacant")]'):
                continue

            # Keep only text nodes that still have content once
            # non-breaking spaces are treated as ordinary spaces.
            cells = [
                text.strip()
                for text in info_cell.xpath('.//text()')
                if re.sub('\xa0', ' ', text).strip()
            ]
            name = cells[0].replace('Councillor ', '')
            district = info_cell.xpath('.//p[contains(text(), "District")]//text()')[0]
            email = self.get_email(info_cell)
            phone = self.get_phone(info_cell, area_codes=[438, 514], error=False)
            img_url = urljoin(councillors_url, img_cell.xpath('.//img/@src')[0])

            person = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
            person.add_source(COUNCIL_PAGE)
            person.add_source(councillors_url)
            person.add_contact('email', email)
            if phone:
                person.add_contact('voice', phone, 'legislature')
            person.image = img_url
            yield person
Example #22
0
    def scrape(self):
        """Yield a Person for each row of the 800-wide council table.

        The first row is the mayor of Mercier; every other row is a
        councillor whose district number is the first digit found in the
        row's fourth text node.
        """
        page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT, encoding='windows-1252')

        rows = page.xpath('//table[@width="800"]/tr')
        assert len(rows), 'No councillors found'
        mayor_row = rows[0]
        for row in rows:
            # The name is extracted the same way for everyone: the bold text
            # with the honorific removed.
            bold_text = row.xpath('.//strong/text()')[0]
            name = bold_text.replace('Monsieur', '').replace('Madame', '').strip()

            if row == mayor_row:
                role = 'Maire'
                district = 'Mercier'
            else:
                role = 'Conseiller'
                district_text = row.xpath('.//text()')[3]
                district = 'District {}'.format(re.search(r'(\d)', district_text).group(1))

            email = self.get_email(row)

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)
            person.add_contact('email', email)
            yield person
Example #23
0
    def scrape(self):
        """Yield a Person for every member page linked from the council page.

        The page title reads "<role> <name>". The mayor's district is
        'Burnaby'; everyone else gets a numbered 'Burnaby (seat N)' district
        in page order. Email and phone are attached only when the member
        page has a "Contact" box in its right column.
        """
        councillor_seat_number = 1

        page = self.lxmlize(COUNCIL_PAGE)

        for person_url in page.xpath('//h4/a/@href'):
            person_page = self.lxmlize(person_url)

            role, name = person_page.xpath('//title//text()')[0].split(' ', 1)
            photo_url = person_page.xpath('//div[@id="content"]//img[@style]/@src')[0]

            # Reset for every member: previously these were only bound inside
            # the `if`, so a page without a contact box raised NameError for
            # the first member or reused the previous member's details.
            email = None
            phone = None
            contact_node = person_page.xpath('//div[@id="column-right"]//div[contains(., "Contact")]')
            if contact_node:
                email = self.get_email(contact_node[0])
                phone = self.get_phone(contact_node[0], area_codes=[604, 778])

            if role == 'Mayor':
                district = 'Burnaby'
            else:
                district = 'Burnaby (seat {})'.format(councillor_seat_number)
                councillor_seat_number += 1

            p = Person(primary_org='legislature', name=name, district=district, role=role, image=photo_url)
            p.add_source(COUNCIL_PAGE)
            p.add_source(person_url)
            if email:
                p.add_contact('email', email)
            if phone:
                p.add_contact('voice', phone, 'legislature')
            yield p
Example #24
0
    def scrape(self):
        """Yield a Person for every odd-positioned row of the content table.

        Rows whose bold text contains 'Deputy Warden' or 'Warden' get that
        title as role and 'Lambton' as district; everyone else is a
        'Councillor' with a numbered 'Lambton (seat N)' district.
        """
        next_seat = 1

        page = self.lxmlize(COUNCIL_PAGE)
        rows = page.xpath('//div[@id="content"]//table//tr[position() mod 2 = 1]')
        assert len(rows), 'No councillors found'
        for row in rows:
            text = row.xpath('.//strong/text()')[0]

            # 'Deputy Warden' must be tested first because it contains 'Warden'.
            for title in ('Deputy Warden', 'Warden'):
                if title in text:
                    role = title
                    name = text.replace(title, '')
                    district = 'Lambton'
                    break
            else:
                role = 'Councillor'
                name = text
                district = 'Lambton (seat {})'.format(next_seat)
                next_seat += 1

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)

            person.image = row.xpath('.//img/@src')[0]
            person.add_contact('email', self.get_email(row))

            yield person
Example #25
0
    def scrape(self):
        """Yield a Person for each non-email link in the first 'entry' block.

        A member page whose main content mentions 'Mayor' is recorded as the
        mayor of Saanich; every other page is a 'Councillor' with a numbered
        'Saanich (seat N)' district.
        """
        next_seat = 1

        index = self.lxmlize(COUNCIL_PAGE)
        links = index.xpath('//div[contains(@class, "entry")]')[0].xpath('.//@href')
        assert len(links), 'No councillors found'
        for url in links:
            # Links containing "@" are skipped rather than fetched.
            if '@' in url:
                continue

            member_page = self.lxmlize(url)
            main = member_page.xpath('//main[@id="content"]')[0]

            name = main.xpath('.//h1//text()')[0]

            if 'Mayor' not in main.text_content():
                role = 'Councillor'
                district = 'Saanich (seat {})'.format(next_seat)
                next_seat += 1
            else:
                name = name.replace('Mayor ', '')
                role = 'Mayor'
                district = 'Saanich'

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.image = member_page.xpath('.//@src')[0]
            phone = self.get_phone(member_page, area_codes=[250])
            person.add_contact('voice', phone, 'legislature')
            person.add_contact('email', self.get_email(main))
            person.add_source(COUNCIL_PAGE)
            person.add_source(url)
            yield person
Example #26
0
    def scrape(self):
        """Yield the mayor, then each non-vacant councillor paragraph.

        Councillors are the ``<p>`` siblings following the "Councillors"
        ``<h3>``, except the last one. Regional councillors get a numbered
        'Whitby (seat N)' district; others carry "<role>, <district>".
        """
        next_regional_seat = 1
        page = self.lxmlize(COUNCIL_PAGE)

        yield self.scrape_mayor(page)

        paragraphs = page.xpath('//h3[contains(text(), "Councillors")]/following-sibling::p')[:-1]
        assert len(paragraphs), 'No councillors found'
        for paragraph in paragraphs:
            bold_parts = paragraph.xpath('./strong/text()')
            if not bold_parts or 'Vacant' in bold_parts:
                continue

            # Exactly two bold fragments are expected: name, then role/district.
            name, role_district = bold_parts
            name = name.rstrip(',')

            if 'Regional Councillor' not in role_district:
                role, district = role_district.strip().split(', ')
                # Drop any parenthesised suffix from the district.
                district = district.split(' (')[0]
            else:
                role = role_district
                district = 'Whitby (seat {})'.format(next_regional_seat)
                next_regional_seat += 1

            email = self.get_email(paragraph)
            image = paragraph.xpath('./img/@src')[0]
            person = Person(primary_org='legislature', name=name, district=district, role=role, image=image)
            person.add_source(COUNCIL_PAGE)
            person.add_contact('email', email)
            yield person
Example #27
0
    def scrape(self):
        """Yield a Person for each link in the "WSIndent" paragraphs of COUNCIL_PAGE.

        Link text containing "Ward N" yields a councillor for that ward; any
        other link is treated as the mayor of "Kawartha Lakes". Email and photo
        are read from the page each link points to.
        """
        listing = self.lxmlize(COUNCIL_PAGE)

        for link in listing.xpath('//p[@class="WSIndent"]/a'):
            label = link.text_content()
            ward = re.search(r'(Ward [0-9]{1,2})', label)
            if ward is None:
                role, district = 'Mayor', 'Kawartha Lakes'
                name = label.replace('Mayor', '').strip()
            else:
                role, district = 'Councillor', ward.group(1)
                # The name is whatever remains once the ward label is removed.
                name = label.replace(district, '').strip()

            url = link.attrib['href']
            profile = self.lxmlize(url)
            email = self.get_email(profile)
            image = profile.xpath('//img[@class="image-right"]/@src')[0]

            member = Person(primary_org='legislature', name=name, district=district, role=role)
            member.add_source(COUNCIL_PAGE)
            member.add_source(url)
            member.add_contact('email', email)
            member.image = image
            yield member
Example #28
0
    def scrape(self):
        """Yield a Person for each non-vacant row of the members table on COUNCIL_PAGE.

        The first three text nodes of a row are read as "Last, First", party and
        district. Photo, phone and email come from the detail page linked from
        the row's first cell; phone and email are optional.
        """
        def char(code):
            """Return the character for a numeric code, or `code` itself if it is not numeric."""
            try:
                return chr(int(code))
            except ValueError:
                return code

        page = self.lxmlize(COUNCIL_PAGE)
        for row in page.xpath('//div[@id="content"]/table/tbody/tr'):
            cell_texts = row.xpath('./td//text()')
            if 'Vacant' in cell_texts[0]:
                continue

            full_name, party, district = cell_texts[:3]
            # "Last, First" -> "First Last". Strip each part so the space after
            # the comma does not end up at the start of the name.
            name = ' '.join(part.strip() for part in reversed(full_name.split(',')))

            p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=self.PARTIES[party])

            detail_url = row[0][0].attrib['href']
            detail = self.lxmlize(detail_url)

            image = detail.xpath('//img[@class="portrait"]/@src')[0]
            p.image = image

            # The phone number is optional: a missing node or missing ": "
            # separator raises IndexError, in which case no phone is recorded.
            try:
                p.add_contact('voice', detail.xpath('//dd[@class="numbers"]/text()')[0].split(': ')[1], 'legislature')
            except IndexError:
                pass

            script = detail.xpath('//dd/script/text()')
            if script:
                # The script assigns quoted pieces in reverse order; numeric
                # pieces are character codes. Presumably this is email
                # obfuscation -- the decoded markup wraps the address in a tag.
                codes = reversed(re.findall(r"]='(.+?)'", script[0]))
                content = ''.join(char(code) for code in codes)
                p.add_contact('email', re.search(r'>(.+)<', content).group(1))

            p.add_source(COUNCIL_PAGE)
            p.add_source(detail_url)
            yield p
Example #29
0
    def scrape(self):
        """Yield a Person for each "members/" link found on COUNCIL_PAGE.

        The page heading is split into role (first word) and name (the rest).
        The mayor's district is "Richmond"; everyone else gets the next numbered
        "Richmond (seat N)". The single email taken from CONTACT_URL is attached
        to every person.
        """
        shared_email = self.get_email(self.lxmlize(CONTACT_URL))

        listing = self.lxmlize(COUNCIL_PAGE)
        seats_assigned = 0
        for url in listing.xpath('//a/@href[contains(., "members/")]'):
            member_page = self.lxmlize(url)
            heading = member_page.xpath('//h1//text()')[0]
            role, name = heading.split(' ', 1)
            # First image on the member page.
            photo_url = member_page.xpath('//img/@src')[0]

            if role == 'Mayor':
                district = 'Richmond'
            else:
                seats_assigned += 1
                district = 'Richmond (seat {})'.format(seats_assigned)

            member = Person(primary_org='legislature', name=name, district=district, role=role)
            member.image = photo_url
            for source in (COUNCIL_PAGE, CONTACT_URL, url):
                member.add_source(source)
            member.add_contact('email', shared_email)
            yield member
Example #30
0
    def scrape(self):
        """Yield the mayor, then a Person for each councillor cell on COUNCIL_PAGE.

        The first matching table cell is handed to `scrape_mayor`. Each later
        cell that contains a link is a councillor: the first two non-empty text
        nodes are the name and district, and phone numbers, links and email are
        read from the page the cell links to.

        Raises:
            AssertionError: if no matching cells are found on COUNCIL_PAGE.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//div[@class="article-content"]//td[@class="ms-rteTableOddCol-0"]')
        # Check before indexing so an empty result reports the intended message
        # rather than an IndexError from councillors[0].
        assert len(councillors), 'No councillors found'
        yield self.scrape_mayor(councillors[0])
        for councillor in councillors[1:]:
            if not councillor.xpath('.//a'):
                continue

            texts = [text for text in councillor.xpath('.//text()') if clean_string(text)]
            name = texts[0]
            district = texts[1]
            url = councillor.xpath('.//a/@href')[0]
            page = self.lxmlize(url)

            p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            # The photo is in a cell before this one; take the nearest image.
            p.image = councillor.xpath('./preceding-sibling::td//img/@src')[-1]

            contacts = page.xpath('.//td[@class="ms-rteTableOddCol-0"]//text()')
            for contact in contacts:
                # Any text with four consecutive digits is treated as a phone number.
                if re.findall(r'[0-9]{4}', contact):
                    phone = contact.strip().replace(' ', '-')
                    p.add_contact('voice', phone, 'legislature')
            get_links(p, page.xpath('.//td[@class="ms-rteTableOddCol-0"]')[0])

            email = self.get_email(page)
            p.add_contact('email', email)
            yield p
Example #31
0
    def scrape(self):
        """Yield a Person for each <strong> element inside the "printArea" div.

        The <strong> text is the name. The direct text of its parent <p> (or
        parent <div> when the <p> has none) supplies the title line followed by
        "label: value" contact lines.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        for heading in page.xpath('//div[@id="printArea"]//strong'):
            raw_lines = heading.xpath('./parent::p/text()') or heading.xpath('./parent::div/text()')
            lines = [line for line in raw_lines if line.strip()]

            # Keep the title up to and including "Ward <digit>"; drop the rest.
            title = re.sub(r'(?<=Ward \d).+', '', lines[0])
            contact_lines = lines[1:]

            if 'Mayor' in title:
                district, role = 'Woolwich', 'Mayor'
            else:
                district, role = title.replace('Councillor', '').strip(), 'Councillor'

            member = Person(primary_org='legislature', name=heading.text_content(), district=district, role=role)
            member.add_source(COUNCIL_PAGE)
            member.image = heading.xpath('./img/@src')[0]

            for line in contact_lines:
                label, value = line.split(':')
                value = value.strip().replace('(', '').replace(') ', '-').replace('extension ', 'x')
                # The label is used as both the contact type and its note.
                member.add_contact(label, value, label)
            yield member
Example #32
0
    def scrape(self):
        """Yield a Person for each non-vacant "ligne" block on COUNCIL_PAGE.

        The name is the <h3> text with "Last, First" reversed. A block under a
        heading containing "Mairie" is the mayor of "Québec"; otherwise the
        district is parsed from the block's target="_blank" link text.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//div[contains(@class, "ligne")]')
        for councillor in councillors:

            name = ' '.join(councillor.xpath('.//h3')[0].text_content().strip().split(', ')[::-1])
            if 'vacant' in name:
                continue
            # Nearest preceding <h2> sibling is the section this block belongs to.
            district = councillor.xpath('./preceding-sibling::h2/text()')[-1]
            if 'Mairie' in district:
                district = 'Québec'
                role = 'Maire'
            else:
                text = councillor.xpath('.//a[@target="_blank"]/text()')
                # Raw string: the anchors and digit class are regex escapes, not
                # valid Python string escapes.
                district = re.search(r'\ADistrict électoral (?:de|du|des) (.+) - ?\d+\Z', text[0].strip().replace('\xa0', ''), flags=re.U).group(1)
                role = 'Conseiller'

            # The pattern above consumes the article, so restore it where the
            # district name includes one.
            if district == 'Monts':
                district = 'Les Monts'
            elif district == 'Plateau':
                district = 'Le Plateau'
            else:
                district = re.sub('–', '—', district)  # n-dash, m-dash
                district = re.sub(r'\Ala ', 'La ', district)

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.image = councillor.xpath('./p//img/@src')[0]

            phone = self.get_phone(councillor, area_codes=[418])
            p.add_contact('voice', phone, 'legislature')
            yield p
Example #33
0
    def scrape(self):
        """Yield a Person (role "MLA") for each non-vacant row of the members table.

        The cells after the first hold the riding, "Name (Party)" and email.
        The photo URL is resolved via `get_photo_url` from the link in the
        row's third cell.
        """
        # Ridings that are recorded under a different district name.
        # @see https://en.wikipedia.org/wiki/Charlotte-Campobello
        # @see https://en.wikipedia.org/wiki/Oromocto-Lincoln-Fredericton
        district_renames = {
            'Saint Croix': 'Charlotte-Campobello',
            'Oromocto-Lincoln-Fredericton': 'Oromocto-Lincoln',
        }

        page = self.lxmlize(COUNCIL_PAGE, encoding='utf-8')
        rows = page.xpath('//table/tbody/tr')
        assert len(rows), 'No members found'
        for row in rows:
            # Collapse runs of whitespace inside each cell.
            cells = [' '.join(td.text_content().split()) for td in row[1:]]
            riding, table_name, email = cells
            if 'Vacant' in table_name:
                continue

            district = riding.replace('\x97', '-')
            parsed = re.match(r'(.+) \((.+)\)', table_name)
            name_with_status, party_abbr = parsed.groups()
            # Anything after a comma in the name is dropped.
            name = name_with_status.split(',')[0]
            photo_page_url = row[2][0].attrib['href']
            photo_url = self.get_photo_url(photo_page_url)

            district = district_renames.get(district, district)

            member = Person(primary_org='legislature', name=name, district=district, role='MLA',
                            party=get_party(party_abbr.strip()), image=photo_url)
            member.add_contact('email', email)
            member.add_source(photo_page_url)
            member.add_source(COUNCIL_PAGE)
            yield member
Example #34
0
    def scrape(self):
        """Yield a Person for each "member-box" div on COUNCIL_PAGE.

        Name, phone, optional email and photo come from the box itself. The
        district comes from the "fiche__category" text on the page the box links
        to: "Maire" means the mayor of "Terrebonne", anything else is used as a
        district number.
        """
        listing = self.lxmlize(COUNCIL_PAGE, 'utf-8')
        boxes = listing.xpath('//div[contains(@class, "member-box member-box--")]')
        assert len(boxes), 'No councillors found'
        for box in boxes:
            name = box.xpath('.//div[@class="fiche__name"]/text()')[0]
            phone_text = box.xpath('.//div[@class="fiche__social"]/span/text()')[0]
            # Keep what follows the first "T" in the phone text.
            phone = phone_text.split('T')[1]
            mailto_links = box.xpath('.//div[@class="fiche__social"]/a[contains(@href, "mailto")]/@href')
            photo_url = box.xpath('.//img')[0].attrib['src']

            profile_url = box.xpath('.//a[@class="member-box__calltoaction"]/@href')[0]
            profile = self.lxmlize(profile_url)
            category = profile.xpath('.//div[@class="fiche__category"]/text()')[0]

            if category == 'Maire':
                district, role = 'Terrebonne', 'Maire'
            else:
                district, role = 'District {}'.format(category), 'Conseiller'

            member = Person(primary_org='legislature', name=name, district=district, role=role, image=photo_url)
            member.add_source(COUNCIL_PAGE)
            member.add_contact('voice', phone, 'legislature')
            if mailto_links:
                member.add_contact('email', mailto_links[0].split('mailto:')[1])
            yield member
Example #35
0
    def scrape(self):
        """Yield a Person (role "MLA") for each non-empty row of the first table.

        Name and profile URL come from the first link in the second cell; the
        district comes from the link whose text contains "MLA". Email, phone and
        photo are added only when `scrape_extended_info` returns something.
        """
        listing = self.lxmlize(COUNCIL_PAGE)
        rows = listing.xpath('//table[1]//tr')

        assert len(rows), 'No members found'
        for row in rows:
            if not row.text_content().strip():
                continue

            name = row.xpath('./td[2]//a[1]//text()')[0]

            mla_label = row.xpath('./td[2]//a[contains(.//text(), "MLA")]//text()')[0]
            # Take the text after the first colon, expand "St " and rejoin the
            # first two hyphen-separated parts without surrounding spaces.
            pieces = mla_label.split(':')[1].replace('St ', 'St. ').split('-')
            district = '{}-{}'.format(pieces[0].strip(), pieces[1].strip())

            url = row.xpath('./td[2]//a[1]/@href')[0]
            extended = self.scrape_extended_info(url)

            member = Person(primary_org='legislature', name=name, district=district, role='MLA')
            member.add_source(COUNCIL_PAGE)
            member.add_source(url)

            if extended:  # member pages might return errors
                email, phone, photo_url = extended
                member.image = photo_url
                if email:
                    member.add_contact('email', email)
                if phone:
                    member.add_contact('voice', phone, 'legislature')
            yield member
Example #36
0
    def scrape(self):
        """Yield a Person for each non-vacant member div in every council section.

        A section whose nearest preceding <h2> contains "Mairie" holds the mayor
        of "Québec"; in other sections the district is parsed from the member's
        "jobTitle" link text.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        for section in page.xpath('//div[contains(@class, "membres-conseil-municipal")]'):
            entries = section.xpath('./div')
            assert len(entries), 'No councillors found'
            for entry in entries:
                # <h3> text nodes are joined in reverse order to form the name.
                name = ' '.join(reversed(entry.xpath('./h3//text()')))
                if 'vacant' in name.lower():
                    continue

                header = section.xpath('./preceding-sibling::h2/text()')[-1]
                if 'Mairie' in header:
                    district, role = 'Québec', 'Maire'
                else:
                    job_title = entry.xpath('./p[@itemprop="jobTitle"]/a/text()')[0]
                    match = re.search(r'\ADistrict (?:de(?: la)?|du|des) ([\w —–-]+)', job_title, flags=re.U)
                    district, role = match.group(1), 'Conseiller'

                if district != 'Saules':
                    district = re.sub(r'–', '—', district)  # n-dash, m-dash
                else:
                    # The pattern above consumes the article; put it back.
                    district = 'Les Saules'

                member = Person(primary_org='legislature', name=name, district=district, role=role)
                member.add_source(COUNCIL_PAGE)
                member.image = entry.xpath('./figure//@src')[0]
                phone = self.get_phone(entry, area_codes=[418])
                member.add_contact('voice', phone, 'legislature')
                yield member
Example #37
0
    def scrape(self):
        """Yield a Person for each table cell on COUNCIL_PAGE.

        The first <p> (or, failing that, the first <span>) of a cell holds the
        name and role as two text nodes. City and regional councillors are
        numbered separately as "Oshawa (seat N)"; any other role keeps its text
        and gets the district "Oshawa".
        """
        # Role text on the page -> role recorded; these roles get numbered seats.
        seated_roles = {
            'City Councillor': 'Councillor',
            'Regional and City Councillor': 'Regional Councillor',
        }
        seats_assigned = {'Councillor': 0, 'Regional Councillor': 0}

        page = self.lxmlize(COUNCIL_PAGE)
        cells = page.xpath('//table//td')

        assert len(cells), 'No councillors found'
        for cell in cells:
            name, role = cell.xpath('./p[1]/text()') or cell.xpath('./span[1]/text()')
            role = role.strip()

            if role in seated_roles:
                role = seated_roles[role]
                seats_assigned[role] += 1
                district = 'Oshawa (seat {})'.format(seats_assigned[role])
            else:
                district = 'Oshawa'

            photo_url = cell.xpath('./p/img/@src')[0]
            phone_node = cell.xpath('./p[contains(.//text(), "Phone")]')[0]
            phone = self.get_phone(phone_node, area_codes=[905])

            member = Person(primary_org='legislature', name=name, district=district, role=role, image=photo_url)
            member.add_source(COUNCIL_PAGE)
            member.add_contact('voice', phone, 'legislature')
            member.add_contact('email', self.get_email(cell))
            yield member
Example #38
0
    def scrape(self):
        """Yield a Person for each row of the "Table1table" table on COUNCIL_PAGE.

        The second cell's text nodes are role, name and (optionally) district;
        with fewer than three nodes the district is "Milton". The third cell
        holds "type: number" lines, and for the first row only, an address.
        """
        # Role text on the page -> role recorded; other roles pass through.
        role_names = {
            'Mayor and Regional Councillor': 'Mayor',
            'Local & Regional Councillor': 'Regional Councillor',
            'Local Councillor': 'Councillor',
        }

        page = self.lxmlize(COUNCIL_PAGE)

        rows = page.xpath('//table[@id="Table1table"]/tbody/tr')
        assert len(rows), 'No councillors found'
        for position, row in enumerate(rows):
            details = row.xpath('./td[2]/p/text()')
            name = details[1]
            role = details[0].strip()
            role = role_names.get(role, role)
            district = 'Milton' if len(details) < 3 else details[2]

            member = Person(primary_org='legislature', name=name, district=district, role=role)
            member.add_source(COUNCIL_PAGE)

            member.image = row.xpath('./td[1]/p//img/@src')[0]

            # Only the first row's address is recorded.
            if position == 0:
                address_lines = row.xpath('./td[3]/p[1]/text()')
                address = ', '.join(address_lines).replace('Email:', '').strip()
                member.add_contact('address', address, 'legislature')

            for entry in row.xpath('./td[3]/p[2]/text()'):
                num_type, value = entry.split(':')
                value = value.replace(', ext ', ' x').strip()
                # The label is used as both the contact type and its note.
                member.add_contact(num_type, value, num_type)

            yield member
Example #39
0
    def scrape(self):
        """Yield a Person for each "js-council-member" section on COUNCIL_PAGE.

        A section with a "District " info label is a councillor for that
        district (the text before " – "). The first section, if it has no such
        label, is the mayor of "Pointe-Claire".
        """
        page = self.lxmlize(COUNCIL_PAGE)

        sections = page.xpath(
            '//section[contains(@id, "js-council-member")]')
        assert len(sections), 'No councillors found'
        for index, section in enumerate(sections):
            name = ' '.join(section.xpath('.//h2/text()'))
            district_labels = section.xpath(
                './/span[contains(@class, "c-info-list_label")][contains(text(), "District ")]'
            )

            if district_labels:
                role = 'Conseiller'
                district = district_labels[0].text_content().split(' – ')[0]
            elif index == 0:
                role = 'Maire'
                district = 'Pointe-Claire'
            else:
                # NOTE(review): a later section without a district label keeps
                # the empty xpath result as its district -- confirm intended.
                role = 'Conseiller'
                district = district_labels

            member = Person(primary_org='legislature',
                            name=name,
                            district=district,
                            role=role)
            member.image = section.xpath('.//@src')[0]
            member.add_contact('email', self.get_email(section))
            phone = self.get_phone(section, area_codes=[514])
            member.add_contact('voice', phone, 'legislature')
            member.add_source(COUNCIL_PAGE)
            yield member
Example #40
0
    def scrape(self):
        """Yield the mayor, then a Person for each landing-block link but the last.

        On each linked page the <h1> is the district and the <h2> the name.
        Sidebar paragraphs, except the last, supply fax and phone contacts keyed
        by their <strong> label.
        """
        landing = self.lxmlize(COUNCIL_PAGE)

        yield self.scrape_mayor()

        for link in landing.xpath('//h2[@class="landing-block-title"]/a')[:-1]:
            url = link.attrib['href']
            profile = self.lxmlize(url)

            district = profile.xpath('//div[@id="main-content"]/h1/text()')[0]
            name = profile.xpath('//div[@id="main-content"]/h2/text()')[0]

            member = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            member.add_source(COUNCIL_PAGE)
            member.add_source(url)

            sidebar_paragraphs = profile.xpath('//aside[@class="page-sidebar"]/div[1]/p')
            for paragraph in sidebar_paragraphs[:-1]:
                label = paragraph.xpath('./strong/text()')[0]
                if 'Contact' in label:
                    continue
                value = paragraph.xpath('./a/text()')[0]
                if 'Fax' in label:
                    member.add_contact('fax', value, 'legislature')
                if 'Phone' in label:
                    # The label is passed as both the contact type and its note.
                    member.add_contact(label, value, label)

            yield member
Example #41
0
    def scrape(self):
        """Yield a Person (role "MLA") for each non-vacant row of the first table.

        Rows after the header have three cells: name ("Last, First"),
        constituency and party. Email and optional photo come from the page
        linked in the name cell.
        """
        listing = self.lxmlize(COUNCIL_PAGE, encoding='utf-8')
        member_rows = listing.xpath('//table')[0].xpath('.//tr')[1:]
        assert len(member_rows), 'No members found'
        for row in member_rows:
            name_cell, constituency_cell, party_cell = row.xpath('.//td')
            listed_name = name_cell.text_content().strip()
            if listed_name.lower() == 'vacant':
                continue

            last, first = listed_name.split(',')
            # Drop the "Hon." honorific and title-case the surname.
            given = first.replace('Hon.', '').strip()
            surname = last.title().strip()
            name = given + ' ' + surname
            # Collapse runs of whitespace in the constituency name.
            district = ' '.join(constituency_cell.text_content().split())
            party = get_party(party_cell.text)

            url = name_cell.xpath('.//a')[0].get('href')

            profile = self.lxmlize(url)
            email = self.get_email(profile)

            member = Person(primary_org='legislature', name=name, district=district, role='MLA', party=party)
            member.add_source(COUNCIL_PAGE)
            member.add_source(url)
            member.add_contact('email', email)

            photos = profile.xpath('//img[@class="page_graphic"]/@src')
            if photos:
                member.image = photos[0]

            yield member
Example #42
0
    def scrape(self):
        """Yield a Person (role "MPP") for each "addressblock" div on COUNCIL_PAGE.

        Name, riding, email and phone come from the block; party and optional
        photo come from the member page linked from the name.
        """
        listing = self.lxmlize(COUNCIL_PAGE)
        for block in listing.xpath('//div[@class="addressblock"]'):
            name_link = block.xpath('.//a[@class="mpp"]')[0]
            # Collapse runs of whitespace in the name.
            name = ' '.join(name_link.text.split())

            riding_text = block.xpath('.//div[@class="riding"]//text()')[0].strip()
            riding = riding_text.replace('--', '\u2014')
            district = riding.replace('Chatham—Kent', 'Chatham-Kent')  # m-dash to hyphen
            mpp_url = name_link.attrib['href']

            mpp_page = self.lxmlize(mpp_url)

            photos = mpp_page.xpath('//img[@class="mppimg"]/@src')
            # Party is the text of the last paragraph of the info block.
            party = mpp_page.xpath('//div[@class="mppinfoblock"]/p[last()]/text()')[0].strip()

            member = Person(primary_org='legislature', name=name, district=district, role='MPP', party=party)
            if photos:
                member.image = photos[0]
            member.add_source(COUNCIL_PAGE)
            member.add_source(mpp_url)

            email_nodes = block.xpath('.//div[@class="email"]')
            if email_nodes:
                member.add_contact('email', self.get_email(email_nodes[0]))

            phone_texts = block.xpath('.//div[@class="phone"]//text()')
            if phone_texts:
                member.add_contact('voice', phone_texts[0], 'legislature')

            yield member
Example #43
0
    def scrape(self):
        """Yield a Person (role "MLA") for each non-vacant row of the members table.

        After the first cell, a row holds the riding, "Name (Party)" and email.
        The photo URL is resolved via `get_photo_url` from the link in the
        row's third cell.
        """
        def squeeze(cell):
            """Return the cell's text with runs of whitespace collapsed."""
            return ' '.join(cell.text_content().split())

        page = self.lxmlize(COUNCIL_PAGE, encoding='utf-8')
        table_rows = page.xpath('//table/tbody/tr')
        assert len(table_rows), 'No members found'
        for row in table_rows:
            riding, table_name, email = map(squeeze, row[1:])
            if 'Vacant' in table_name:
                continue

            district = riding.replace('\x97', '-')
            name_with_status, party_abbr = re.match(r'(.+) \((.+)\)',
                                                    table_name).groups()
            # Anything after a comma in the name is dropped.
            name, *_ = name_with_status.split(',')
            photo_page_url = row[2][0].attrib['href']
            photo_url = self.get_photo_url(photo_page_url)

            if district == 'Saint Croix':
                # @see https://en.wikipedia.org/wiki/Charlotte-Campobello
                district = 'Charlotte-Campobello'
            elif district == 'Oromocto-Lincoln-Fredericton':
                # @see https://en.wikipedia.org/wiki/Oromocto-Lincoln-Fredericton
                district = 'Oromocto-Lincoln'

            party = get_party(party_abbr.strip())
            member = Person(primary_org='legislature',
                            name=name,
                            district=district,
                            role='MLA',
                            party=party,
                            image=photo_url)
            member.add_contact('email', email)
            member.add_source(photo_page_url)
            member.add_source(COUNCIL_PAGE)
            yield member
Example #44
0
    def scrape(self):
        """Yield a Person (role "MLA") for each non-vacant row of the first table.

        The header row is skipped. Each remaining row must have exactly three
        cells: name ("Last, First"), constituency and party. Email and optional
        photo are read from the page linked in the name cell.
        """
        member_page = self.lxmlize(COUNCIL_PAGE, encoding='utf-8')
        first_table = member_page.xpath('//table')[0]
        data_rows = first_table.xpath('.//tr')[1:]
        assert len(data_rows), 'No members found'
        for data_row in data_rows:
            cell_name, cell_constituency, cell_party = data_row.xpath('.//td')
            raw_name = cell_name.text_content().strip()
            if raw_name.lower() == 'vacant':
                continue

            last, first = raw_name.split(',')
            # "Hon." is removed from the given name; the surname is title-cased.
            name = '{} {}'.format(first.replace('Hon.', '').strip(), last.title().strip())
            district = ' '.join(cell_constituency.text_content().split())
            party = get_party(cell_party.text)

            url = cell_name.xpath('.//a')[0].get('href')

            detail = self.lxmlize(url)
            email = self.get_email(detail)

            member = Person(primary_org='legislature',
                            name=name,
                            district=district,
                            role='MLA',
                            party=party)
            member.add_source(COUNCIL_PAGE)
            member.add_source(url)
            member.add_contact('email', email)

            # The photo is optional; use the first "page_graphic" image if any.
            for src in detail.xpath('//img[@class="page_graphic"]/@src')[:1]:
                member.image = src

            yield member
Example #45
0
    def scrape(self):
        """Yield the mayor, then a Person for each councillor row on COUNCIL_PAGE.

        Rows containing "Position" are skipped. Councillors sharing a ward are
        numbered "<ward> (seat N)" in page order. Email, phone, photo and an
        optional extra link are read from the councillor's own page.
        """
        seats_per_ward = defaultdict(int)

        listing = self.lxmlize(COUNCIL_PAGE)

        yield self.scrape_mayor()

        for row in listing.xpath('//div[@id="centre_content"]//tr'):
            if 'Position' in row.text_content():
                continue

            cells = row.xpath('./td')
            ward = cells[0].text_content().replace('Councillor', '')
            seats_per_ward[ward] += 1
            district = '{} (seat {})'.format(ward, seats_per_ward[ward])
            name = cells[1].text_content()
            url = row.xpath('./td/a')[0].attrib['href']

            member = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            member.add_source(COUNCIL_PAGE)
            member.add_source(url)

            profile = self.lxmlize(url)

            content = profile.xpath('//div[@id="centre_content"]')[0]
            member.add_contact('email', self.get_email(content))
            phone = self.get_phone(content, area_codes=[226, 519])
            member.add_contact('voice', phone, 'legislature')

            member.image = profile.xpath('string(//div[@id="centre_content"]//img/@src)')  # can be empty

            # With more than two links in the content, record the last one.
            links = profile.xpath('//div[@id="centre_content"]//a')
            if len(links) > 2:
                member.add_link(links[-1].attrib['href'])
            yield member
Example #46
0
    def scrape(self):
        """Yield the mayor, then a Person for each councillor paragraph on COUNCIL_PAGE.

        Councillors are the <p> siblings following the "Councillors" <h3>,
        except the last. The <strong> text is joined and split at the first
        ", " into name and role/district. Regional councillors get a numbered
        "Whitby (seat N)" district.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        yield self.scrape_mayor(page)

        regional_seats_assigned = 0
        paragraphs = page.xpath('//h3[contains(text(), "Councillors")]/following-sibling::p')[:-1]
        for paragraph in paragraphs:
            label = ' '.join(paragraph.xpath('./strong/text()'))
            if not label or 'Vacant' in label:
                continue

            name, role_district = label.split(', ', 1)

            if 'Regional Councillor' not in role_district:
                role, district = role_district.strip().split(', ')
                # Drop any parenthesised suffix after the district name.
                district = district.split(' (')[0]
            else:
                regional_seats_assigned += 1
                role = role_district
                district = 'Whitby (seat {})'.format(regional_seats_assigned)

            email = self.get_email(paragraph)
            image = paragraph.xpath('./img/@src')[0]
            member = Person(primary_org='legislature', name=name, district=district, role=role, image=image)
            member.add_source(COUNCIL_PAGE)
            member.add_contact('email', email)
            yield member
Example #47
0
    def scrape(self):
        """Yield a Person for each link inside a "WSIndent" paragraph.

        A link whose text contains "Ward N" is a councillor for that ward; every
        other link is the mayor of "Kawartha Lakes". The linked page provides
        the email and the "image-right" photo.
        """
        council_page = self.lxmlize(COUNCIL_PAGE)

        for anchor in council_page.xpath('//p[@class="WSIndent"]/a'):
            anchor_text = anchor.text_content()
            ward_match = re.search(r'(Ward [0-9]{1,2})', anchor_text)

            is_councillor = ward_match is not None
            if is_councillor:
                district = ward_match.group(1)
                removed = district
            else:
                district = 'Kawartha Lakes'
                removed = 'Mayor'
            role = 'Councillor' if is_councillor else 'Mayor'
            # The name is the link text without the ward label or "Mayor".
            name = anchor_text.replace(removed, '').strip()

            url = anchor.attrib['href']
            detail_page = self.lxmlize(url)
            email = self.get_email(detail_page)
            image = detail_page.xpath('//img[@class="image-right"]/@src')[0]

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)
            person.add_source(url)
            person.add_contact('email', email)
            person.image = image
            yield person
Example #48
0
    def scrape(self):
        """Yield a Person for each council <h3> paired with a contact-table row.

        Headings from COUNCIL_PAGE are zipped with the rows (after the header)
        of the "Mayor & Council" table on CONTACT_PAGE, so pairing relies on
        both pages listing people in the same order. Headings starting with
        "Councill" get a numbered "Abbotsford (seat N)"; others are the mayor.
        """
        council_page = self.lxmlize(COUNCIL_PAGE)
        contact_page = self.lxmlize(CONTACT_PAGE)
        headings = council_page.xpath('//div[@id="main-content"]//h3')
        contact_rows = contact_page.xpath('//p[contains(./strong/text(), "Mayor & Council")]/following-sibling::table[1]//tr')[1:]

        seats_assigned = 0
        for heading, contact_row in zip(headings, contact_rows):
            title = heading.text_content()
            if not title.startswith('Councill'):
                role, district = 'Mayor', 'Abbotsford'
            else:
                seats_assigned += 1
                role = 'Councillor'
                district = 'Abbotsford (seat {})'.format(seats_assigned)

            # Everything after the first space is the name.
            name = title.split(' ', 1)[1]
            image = heading.xpath('./img/@src')[0]
            phone = contact_row.xpath('./td[2]/text()')[0]
            fax = contact_row.xpath('./td[3]/text()')[0]

            member = Person(primary_org='legislature', name=name, district=district, role=role)
            member.add_source(COUNCIL_PAGE)
            member.add_source(CONTACT_PAGE)
            member.image = image
            member.add_contact('voice', phone, 'legislature')
            member.add_contact('fax', fax, 'legislature')

            yield member
Example #49
0
    def scrape(self):
        """Yield the mayor, then a Person for each councillor paragraph containing "-".

        Each paragraph's first link leads to a profile page. The ward number is
        the digits of the profile's first body paragraph; the second body
        paragraph is split on ":" into label/value contact details.
        """
        listing = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1')

        yield self.scrape_mayor()

        entries = listing.xpath('//div[@class="articlebody-inside"]//p[contains(text(),"-")]')
        for entry in entries:
            url = entry.xpath('.//a')[0].attrib['href'].replace('../', '')
            profile = self.lxmlize(url, 'iso-8859-1')

            heading = profile.xpath('//div[@class="articletitle"]/h1')[0].text_content()
            name = heading.replace('Councillor', '').replace('Deputy Mayor', '')
            first_paragraph = profile.xpath('//div[@class="articlebody-inside"]/p')[0].text_content()
            district = 'Ward {}'.format(re.sub(r'\D+', '', first_paragraph))

            member = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            member.add_source(COUNCIL_PAGE)
            member.add_source(url)

            photo_url_rel = profile.xpath('//div[@class="articlebody-inside"]/p/img/@src')[0].replace('/..', '')
            member.image = urljoin(url, photo_url_rel)

            details = profile.xpath('//div[@class="articlebody-inside"]/p')[1].text_content()
            segments = details.replace('Biography', '').replace('Committees', '').split(':')
            # Each segment's value is followed by the next label, so a segment's
            # label is the capitalised word found in the segment before it.
            for previous, value in zip(segments, segments[1:]):
                if not value:
                    continue
                label = re.findall(r'([A-Z][a-z]+)', previous)[0]
                if label != 'Address':
                    # Cut the value at the first capital (where the next label starts).
                    value = re.split(r'[A-Z]', value)[0]
                contact_type = CONTACT_DETAIL_TYPE_MAP[label]
                note = '' if contact_type == 'email' else 'legislature'
                member.add_contact(contact_type, value, note)
            yield member
Example #50
0
    def scrape(self):
        """Yield a Person for each profile linked under "Meet Your Council".

        The profile's <h1> is the name; a name containing "Mayor" is the mayor
        of "Whitby". Otherwise the <h2> gives the role: regional councillors get
        a numbered "Whitby (seat N)", others are split into "Role, District".
        """
        listing = self.lxmlize(COUNCIL_PAGE)

        profile_urls = listing.xpath('//a[@title="Mayor and Council::Meet Your Council"]/following-sibling::ul//@href')
        assert len(profile_urls), 'No councillors found'

        regional_seats_assigned = 0
        for profile_url in profile_urls:
            content = self.lxmlize(profile_url).xpath('//div[@id="printArea"]')[0]
            name = content.xpath('.//h1/text()')[0]

            if 'Mayor' in name:
                role, district = 'Mayor', 'Whitby'
                name = name.replace('Mayor ', '')
            else:
                subtitle = content.xpath('.//h2/text()')[0]
                if 'Regional Councillor' in subtitle:
                    # The full subtitle text is kept as the role.
                    role = subtitle
                    regional_seats_assigned += 1
                    district = 'Whitby (seat {})'.format(regional_seats_assigned)
                else:
                    role, district = subtitle.split(', ')
                    # Drop any parenthesised suffix after the district name.
                    district = district.split(' (')[0]

            image = content.xpath('.//img/@src')[0]

            member = Person(primary_org='legislature', name=name, district=district, role=role)
            member.add_source(COUNCIL_PAGE)
            member.add_contact('voice', self.get_phone(content), 'legislature')
            member.add_contact('email', self.get_email(content))
            member.image = image

            yield member
Example #51
0
    def scrape(self):
        """Yield a ``Person`` for each row of the council "view-people" listing.

        The name, photo and role/district subtitle come from the listing row;
        email and phone are read from the profile page the row links to.
        Both the listing and the profile URL are recorded as sources.
        """
        listing = self.lxmlize(COUNCIL_PAGE)

        rows = listing.xpath(
            '//div[contains(@class, "view-people")]//div[contains(@class, "views-row")]'
        )
        assert len(rows), 'No councillors found'
        for row in rows:
            name = row.xpath('.//div[@property="dc:title"]')[0].text_content()

            subtitle_paragraphs = row.xpath(
                './/div[contains(@class, "field-name-field-sub-title")]//p'
            )
            # The second-to-last paragraph carries the role and district;
            # non-breaking spaces are turned into plain spaces before parsing.
            subtitle = subtitle_paragraphs[-2].text_content().replace('\xa0', ' ')

            if subtitle != 'Mayor':
                # Everything after the first ", " is the district.
                role = 'Councillor'
                district = subtitle.split(', ', 1)[1]
            else:
                role = 'Mayor'
                district = 'Fredericton'

            profile_url = row.xpath('.//@href')[0]
            profile_page = self.lxmlize(profile_url)

            person = Person(primary_org='legislature',
                            name=name,
                            district=district,
                            role=role)
            person.image = row.xpath('.//img[@typeof="foaf:Image"]/@src')[0]
            person.add_contact('email', self.get_email(profile_page))
            person.add_contact('voice',
                               self.get_phone(profile_page, area_codes=[506]),
                               'legislature')
            person.add_source(COUNCIL_PAGE)
            person.add_source(profile_url)
            yield person
Example #52
0
    def scrape(self):
        """Yield a ``Person`` for every non-empty table cell on the council page.

        The first paragraph of each cell must contain exactly three text
        nodes, read in order as district, role and name. The role labels used
        on the page are mapped to the shorter role names below, and
        "City of Oshawa" is recorded as the district "Oshawa".
        """
        # Page label -> role recorded on the Person; other labels pass through.
        role_names = {
            'City Councillor': 'Councillor',
            'Regional & City Councillor': 'Regional Councillor',
        }

        page = self.lxmlize(COUNCIL_PAGE)
        cells = page.xpath('//table//td[*]')

        assert len(cells), 'No councillors found'
        for cell in cells:
            district, raw_role, name = cell.xpath('./p[1]/text()')

            role = raw_role.strip()
            role = role_names.get(role, role)
            if district == 'City of Oshawa':
                district = 'Oshawa'

            photo_url = cell.xpath('./p/img/@src')[0]
            phone_paragraph = cell.xpath('./p[contains(.//text(), "Phone")]')[0]
            phone = self.get_phone(phone_paragraph, area_codes=[905])

            person = Person(primary_org='legislature',
                            name=name,
                            district=district,
                            role=role,
                            image=photo_url)
            person.add_source(COUNCIL_PAGE)
            person.add_contact('voice', phone, 'legislature')
            person.add_contact('email', self.get_email(cell))
            yield person
Example #53
0
    def councillor_data(self, url, name, ward):
        """Yield one councillor ``Person`` built from the profile page at *url*.

        *name* and *ward* are used as given; the photo and phone number are
        read from the profile page. Email is only available through a form on
        a separate page, so no email contact is added here.
        """
        profile = self.lxmlize(url)

        relative_photo = profile.xpath(
            '//div[contains(@id, "contentcontainer")]//img/@src')[0]
        absolute_photo = urljoin(url, relative_photo)

        person = Person(primary_org='legislature',
                        name=name,
                        district=ward,
                        role='Councillor')
        person.add_source(COUNCIL_PAGE)
        person.add_source(url)

        # Look for a phone number in the main container first and fall back to
        # the lower container; the first number found is the only one recorded.
        container_paths = (
            '//div[@id="contentcontainer"]',
            '//div[@id="lowercontentcontainer"]',
        )
        for container_path in container_paths:
            number = self.get_phone(profile.xpath(container_path)[0],
                                    area_codes=[306],
                                    error=False)
            if number:
                person.add_contact('voice', number, 'legislature')
                break

        person.image = absolute_photo
        yield person
Example #54
0
    def scrape(self):
        """Yield a ``Person`` for each row of the council table.

        The second cell of a row holds the role/district line followed by the
        name; the mayor's row instead carries title and name on one line.
        The first cell provides the photo and the third cell a list of
        ``<label>: <number>`` contact lines.

        Names are stripped of surrounding whitespace: removing the mayor's
        title from the combined line, or reading a raw text node, can
        otherwise leave leading or trailing blanks in the recorded name.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//table[@id="Table1table"]/tbody/tr')
        assert len(councillors), 'No councillors found'
        for councillor in councillors:
            # Evaluate the XPath once; both the role line and the name come from it.
            lines = councillor.xpath('./td[2]/p/text()')
            role_district = lines[0].strip()
            if 'Mayor' in role_district:
                name = role_district.replace('Mayor and Regional Councillor', '').strip()
                role = 'Mayor'
                district = 'Milton'
            else:
                name = lines[1].strip()
                # Split on the space immediately before "Ward" so the district
                # keeps its "Ward ..." prefix.
                role, district = re.split(r' (?=Ward)', role_district)
                if role == 'Town and Regional Councillor':
                    role = 'Regional Councillor'
                elif role == 'Town Councillor':
                    role = 'Councillor'

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)

            p.image = councillor.xpath('./td[1]/p//img/@src')[0]

            numbers = councillor.xpath('./td[3]/p[2]/text()')
            for number in numbers:
                # The label before the colon is passed as both contact type and note.
                num_type, number = number.split(':')
                number = number.replace(', ext ', ' x').strip()
                p.add_contact(num_type, number, num_type)

            yield p
Example #55
0
    def scrape(self):
        """Yield the mayor, then a ``Person`` for every non-vacant table row.

        The mayor is delegated to ``self.scrape_mayor``. For each remaining
        row the district comes from the first cell and the second cell links
        to a detail page supplying the name, photo and email. Districts that
        mention "Greenfield Park" or "Conseiller n" are recorded as numbered
        Greenfield Park seats, counted in page order over non-vacant rows.
        """
        page = self.lxmlize(COUNCIL_PAGE, 'utf-8')

        yield self.scrape_mayor(page)

        rows = page.xpath('//tbody/tr')
        assert len(rows), 'No councillors found'

        next_seat = 1
        for row in rows:
            # Vacant seats produce no Person and do not consume a seat number.
            if row.xpath('./td[2]//text()')[0] == 'Vacant':
                continue

            district = row.xpath('./td[1]/text()')[0]
            if 'Greenfield Park' in district or 'Conseiller n' in district:
                district = 'Greenfield Park (siège {})'.format(next_seat)
                next_seat += 1

            profile_url = row.xpath('./td[2]/a/@href')[0]
            profile = self.lxmlize(profile_url, 'utf-8')

            name = profile.xpath('//h1/text()')[0]

            # Prefer the image whose alt text mentions the councillor's name;
            # otherwise fall back to the image with the "droite" class.
            named_photos = profile.xpath(
                '//img[contains(@alt, "{0}")]/@src'.format(name))
            photo_url = named_photos[0] if named_photos else profile.xpath(
                '//img[contains(@class, "droite")]/@src')[0]

            person = Person(primary_org='legislature',
                            name=name,
                            district=district,
                            role='Conseiller')
            person.add_source(COUNCIL_PAGE)
            person.add_source(profile_url)
            person.image = photo_url
            person.add_contact('email', self.get_email(profile))
            yield person
Example #56
0
    def scrape(self):
        """Yield a ``Person`` for each ``block text`` div on the council page.

        The block's ``h2`` decides the seat: a heading containing "Maire"
        yields the mayor of Sainte-Anne-de-Bellevue, any other heading yields
        a councillor for "District N", where N is the first run of digits in
        the heading.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        blocks = page.xpath('//div[@class="block text"]')
        assert len(blocks), 'No councillors found'
        for block in blocks:
            name = block.xpath(
                './/div[@class="content-writable"]//strong/text()')[0]
            heading = block.xpath('.//h2/text()')[0]

            if 'Maire' not in heading:
                role = 'Conseiller'
                district = 'District {}'.format(re.search(r'\d+', heading)[0])
            else:
                role = 'Maire'
                district = 'Sainte-Anne-de-Bellevue'

            person = Person(primary_org='legislature',
                            name=name,
                            district=district,
                            role=role)
            person.add_source(COUNCIL_PAGE)

            # The first ``src`` attribute anywhere in the block is used as the photo.
            person.image = block.xpath('.//@src')[0]
            person.add_contact('email', self.get_email(block))
            yield person
Example #57
0
    def scrape(self):
        """Yield a ``Person`` for the warden, deputy warden and each councillor.

        Members are read from every odd-positioned row of the tables inside
        the ``content`` div. The first ``strong`` text of a row holds the name,
        prefixed with "Warden" or "Deputy Warden" for those two offices; all
        other members are councillors and receive numbered Lambton seats in
        page order.

        Names are stripped of surrounding whitespace, since removing the
        title from the text would otherwise leave stray blanks in the name.
        """
        councillor_seat_number = 1

        page = self.lxmlize(COUNCIL_PAGE)
        councillors = page.xpath('//div[@id="content"]//table//tr[position() mod 2 = 1]')
        assert len(councillors), 'No councillors found'
        for councillor in councillors:
            text = councillor.xpath('.//strong/text()')[0]
            # 'Deputy Warden' is tested first because it also contains 'Warden'.
            if 'Deputy Warden' in text:
                role = 'Deputy Warden'
                name = text.replace('Deputy Warden', '').strip()
                district = 'Lambton'
            elif 'Warden' in text:
                role = 'Warden'
                name = text.replace('Warden', '').strip()
                district = 'Lambton'
            else:
                role = 'Councillor'
                name = text.strip()
                district = 'Lambton (seat {})'.format(councillor_seat_number)
                councillor_seat_number += 1

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)

            p.image = councillor.xpath('.//img/@src')[0]
            p.add_contact('email', self.get_email(councillor))

            yield p