Example #1
0
    def scrape(self):
        """Yield a Person for each "<name>, <district>" heading on COUNCIL_PAGE.

        Headings without a comma are ignored. A heading whose district part
        contains "Mayor" yields the mayor (role 'Maire', district
        'Beaconsfield'); any other heading yields a councillor (role
        'Conseiller') whose district is the segment after the first hyphen.
        """
        root = self.lxmlize(COUNCIL_PAGE)

        for heading in root.xpath('//h1[@class="title"]'):
            heading_text = heading.text_content()
            if ',' not in heading_text:
                continue
            name, district = heading_text.split(',')
            name = name.strip()

            if 'Mayor' in district:
                # The mayor's phone and e-mail script are looked up in sibling <div> elements.
                person = Person(primary_org='legislature', name=name, district='Beaconsfield', role='Maire')
                person.add_source(COUNCIL_PAGE)
                person.image = heading.xpath('./parent::div/parent::div/p//img/@src')[0]
                phone_text = heading.xpath('.//parent::div/following-sibling::div[contains(text(), "514")]/text()')[0]
                person.add_contact('voice', phone_text.split(':')[1].strip().replace(' ', '-'), 'legislature')
                script_node = heading.xpath('.//parent::div/following-sibling::div/script')[0]
            else:
                # Councillors' details are looked up in sibling <p> elements; the phone is optional.
                person = Person(primary_org='legislature', name=name, district=district.split('-')[1].strip(), role='Conseiller')
                person.add_source(COUNCIL_PAGE)
                person.image = heading.xpath('./parent::div/parent::div/p//img/@src')[0]
                phone_texts = heading.xpath('.//parent::div/following-sibling::p[contains(text(), "514")]/text()')
                if phone_texts:
                    person.add_contact('voice', phone_texts[0].split(':')[1].strip().replace(' ', '-'), 'legislature')
                script_node = heading.xpath('.//parent::div/following-sibling::p/script')[0]

            person.add_contact('email', get_email(script_node.text_content()))
            yield person
Example #2
0
    def scrape(self):
        """Yield a Person for every sitting councillor, then one for the mayor.

        Councillors are read from the table rows of COUNCIL_PAGE (the first row
        is skipped, and rows whose link text contains "District " are treated
        as vacant). Each councillor's own page is fetched to look for a photo.
        The mayor is read from MAYOR_PAGE.

        Raises:
            ValueError: if no "Mayor <name>" text can be found on MAYOR_PAGE.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//table/tbody/tr')[1:]
        for councillor in councillors:
            name = councillor.xpath('.//a')[0].text_content()
            if 'District ' in name:  # Vacant
                continue
            district = 'District {}'.format(councillor.xpath('.//strong')[0].text_content())

            address = councillor.xpath('.//td')[2].text_content().replace("\r\n", ', ')
            contact_nodes = councillor.xpath('.//td[4]/text()')
            if ':' not in contact_nodes[0]:
                # Some rows wrap the contact lines in <p> elements.
                contact_nodes = councillor.xpath('.//td[4]/p/text()')

            phone = contact_nodes[0].split(':')[1].replace("(", '').replace(") ", '-')
            if 'or' in phone:  # phone and cell
                phone = phone.split('or')[0]

            # email protected by js
            p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_contact('address', address, 'legislature')
            p.add_contact('voice', phone, 'legislature')

            # Only look for a fax line when a second contact line actually exists.
            if len(contact_nodes) > 1 and 'F' in contact_nodes[1]:
                fax = contact_nodes[1].split(':')[1].replace("(", '').replace(") ", '-')
                p.add_contact('fax', fax, 'legislature')

            councillor_url = councillor.xpath('.//a/@href')[0]
            p.add_source(councillor_url)
            # Use a separate name so the council listing in `page` is not shadowed.
            councillor_page = self.lxmlize(councillor_url)
            image = councillor_page.xpath('//img[contains(@title, "{0}")]/@src'.format(name))
            if image:
                p.image = image[0]
            yield p

        mayorpage = self.lxmlize(MAYOR_PAGE)

        # Track the mayor's name separately: reusing `name` would silently fall
        # back to the last councillor's name when nothing matches.
        mayor_name = None
        for node in mayorpage.xpath('//p/*[contains(text(), "Mayor")]//text()'):
            result = re.search('Mayor ([A-Z].+ [A-Z].+[^:])', node)
            if result is not None:
                mayor_name = result.group(1)
                break
        if mayor_name is None:
            raise ValueError('No mayor name found')

        photo_url = mayorpage.xpath('//span/img/@src')[0]
        contact_nodes = mayorpage.xpath('//aside//h3[contains(text(), "Contact")]/following-sibling::div[1]')[0]
        address = contact_nodes.xpath('.//p[1]/text()')[0]
        # The second paragraph holds the phone line followed by the fax line.
        number_lines = contact_nodes.xpath('.//p[2]/text()')
        phone = number_lines[0].split(': ')[1]
        fax = number_lines[1].split(': ')[1]
        email = self.get_email(contact_nodes.xpath('.//p[3]')[0])

        p = Person(primary_org='legislature', name=mayor_name, district='Cape Breton', role='Mayor')
        p.add_source(MAYOR_PAGE)
        p.add_contact('address', address, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        # The fax number was parsed but never recorded before.
        p.add_contact('fax', fax, 'legislature')
        p.add_contact('email', email)
        p.image = photo_url
        yield p
Example #3
0
    def scrape(self):
        """Yield a Person for the mayor, then one per councillor on COUNCIL_PAGE.

        Each person's e-mail address is read from the page behind their
        "Visit" link, which is also recorded as a source.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        mayor_cell = page.xpath('//td[@rowspan="2"]')[0]
        mayor_name = mayor_cell.xpath('.//h3/strong/text()')[0]
        mayor_image = mayor_cell.xpath('.//@src')[0]
        mayor_voice = self.get_phone(mayor_cell)
        mayor_url = mayor_cell.xpath('.//a[contains(., "Visit")]/@href')[0]

        mayor = Person(primary_org='legislature', name=mayor_name, district='Caledon', role='Mayor')
        mayor.add_source(COUNCIL_PAGE)
        mayor.add_source(mayor_url)
        mayor.add_contact('voice', mayor_voice, 'legislature')
        mayor.add_contact('email', self.get_email(self.lxmlize(mayor_url)))
        mayor.image = mayor_image
        yield mayor

        cells = page.xpath('//div[@id="printAreaContent"]//table[2]//td')
        # Cells 12-15 are dropped before the remaining cells are indexed.
        cells = cells[:12] + cells[16:]
        assert len(cells), 'No councillors found'
        for n in range(len(cells) // 3):
            # Within each run of 12 cells: offsets 0-3 are headings, 4-7 photos, 8-11 details.
            base = n // 4 * 12 + n % 4
            heading_cell = cells[base]
            photo_cell = cells[base + 4]
            detail_cell = cells[base + 8]

            district, role = heading_cell.xpath('.//h3/text()')
            name = detail_cell.xpath('.//strong/text()')[0]
            voice = self.get_phone(detail_cell)
            url = detail_cell.xpath('.//a[contains(., "Visit")]/@href')[0]

            image = None if 'photo to come' in photo_cell.text_content() else photo_cell.xpath('.//@src')[0]

            district = district.replace('\xa0', ' ')
            if ' and ' in district:
                district = district.replace('Ward ', 'Wards ')

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)
            person.add_source(url)
            person.add_contact('voice', voice, 'legislature')
            person.add_contact('email', self.get_email(self.lxmlize(url)))
            if image:
                person.image = image

            yield person
Example #4
0
    def scrape(self):
        """Yield a Person for the mayor, then one per councillor on COUNCIL_PAGE.

        People are located through ``span.Title`` elements: the first one is
        the mayor, and each later one is a councillor only if its text splits
        into exactly two parts around a dash ("<name> - <district>").
        """
        root = self.lxmlize(COUNCIL_PAGE)
        title_spans = root.xpath('//span[@class="Title"]')

        mayor_span = title_spans[0]
        mayor_text = ' '.join(mayor_span.xpath('.//text()'))
        # The name is everything before the first opening parenthesis.
        mayor_name = re.search(r'[^(]+', mayor_text).group(0).strip()
        mayor_photo = urljoin(COUNCIL_PAGE, mayor_span.xpath('img/@src')[0])
        mayor_email = mayor_span.xpath('following::a[1]/text()')[0]

        m = Person(primary_org='legislature', name=mayor_name, district='Charlottetown', role='Mayor')
        m.add_source(COUNCIL_PAGE)
        m.add_contact('email', mayor_email)
        m.image = mayor_photo

        yield m

        councillor_spans = title_spans[1:]
        assert len(councillor_spans), 'No councillors found'
        for span in councillor_spans:
            text = ' '.join(span.xpath('.//text()'))
            # Normalise both dash variants to a plain hyphen before splitting.
            parts = text.replace('\u2013', '-').replace('\x96', '-').split('-')
            if len(parts) != 2:
                continue
            raw_name, raw_district = parts

            name = raw_name.strip().replace('Councillor', '')
            name = re.sub(r'\(.+?\)', '', name)
            name = ' '.join(name.split())

            # Keep only the first two whitespace-separated tokens of the district text.
            district_id = ' '.join(raw_district.split()[:2])

            # The photo is the first image after the closest preceding <hr>.
            photo = span.xpath('preceding::hr[1]/following::img[1]/@src')
            photo_url = urljoin(COUNCIL_PAGE, photo[0])

            email = span.xpath('string(following::a[1]/text())')  # can be empty

            p = Person(primary_org='legislature', name=name, district=district_id, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            if email:
                p.add_contact('email', email)
            p.image = photo_url

            yield p
Example #5
0
    def scrape(self):
        """Yield one Person per "<name>, <district>" title heading on COUNCIL_PAGE.

        The mayor (district text containing "Mayor") must have a phone number;
        for councillors the phone number is optional. Everyone gets an e-mail
        contact decoded from a sibling <script> element via get_email().
        """
        page = self.lxmlize(COUNCIL_PAGE)

        for title in page.xpath('//h1[@class="title"]'):
            title_text = title.text_content()
            if ',' not in title_text:
                continue
            name, district = title_text.split(',')
            name = name.strip()

            is_mayor = 'Mayor' in district
            if is_mayor:
                district, role = 'Beaconsfield', 'Maire'
            else:
                district, role = district.split('-')[1].strip(), 'Conseiller'

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.image = title.xpath('./parent::div/parent::div/p//img/@src')[0]

            if is_mayor:
                # Indexing directly keeps the mayor's phone number mandatory.
                phone_text = title.xpath('.//parent::div/following-sibling::div[contains(text(), "514")]/text()')[0]
            else:
                matches = title.xpath('.//parent::div/following-sibling::p[contains(text(), "514")]/text()')
                phone_text = matches[0] if matches else None
            if phone_text is not None:
                p.add_contact('voice', phone_text.split(':')[1].strip().replace(' ', '-'), 'legislature')

            if is_mayor:
                script_nodes = title.xpath('.//parent::div/following-sibling::div/script')
            else:
                script_nodes = title.xpath('.//parent::div/following-sibling::p/script')
            p.add_contact('email', get_email(script_nodes[0].text_content()))
            yield p
Example #6
0
    def scrape(self):
        """Yield the mayor and then every councillor found on COUNCIL_PAGE.

        The mayor comes from the first ``td[rowspan="2"]`` cell; councillors
        come from the cells of the second table inside ``#printAreaContent``.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        node = page.xpath('//td[@rowspan="2"]')[0]
        name = node.xpath('.//h3/strong/text()')[0]
        image = node.xpath('.//@src')[0]
        voice = self.get_phone(node)
        url = node.xpath('.//a[contains(., "Visit")]/@href')[0]

        person = Person(primary_org='legislature', name=name, district='Caledon', role='Mayor')
        for source in (COUNCIL_PAGE, url):
            person.add_source(source)
        person.add_contact('voice', voice, 'legislature')
        person.add_contact('email', self.get_email(self.lxmlize(url)))
        person.image = image
        yield person

        tds = page.xpath('//div[@id="printAreaContent"]//table[2]//td')
        tds = tds[:12] + tds[16:]
        assert len(tds), 'No councillors found'
        for counter in range(len(tds) // 3):
            # Map the running counter to the heading cell of its 12-cell group;
            # the photo cell is 4 further on and the detail cell 8 further on.
            group, column = divmod(counter, 4)
            start = group * 12 + column

            district, role = tds[start].xpath('.//h3/text()')
            info = tds[start + 8]
            name = info.xpath('.//strong/text()')[0]
            voice = self.get_phone(info)
            url = info.xpath('.//a[contains(., "Visit")]/@href')[0]

            picture = tds[start + 4]
            if 'photo to come' in picture.text_content():
                image = None
            else:
                image = picture.xpath('.//@src')[0]

            district = district.replace('\xa0', ' ')
            if ' and ' in district:
                district = district.replace('Ward ', 'Wards ')

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            for source in (COUNCIL_PAGE, url):
                person.add_source(source)
            person.add_contact('voice', voice, 'legislature')
            person.add_contact('email', self.get_email(self.lxmlize(url)))
            if image:
                person.image = image

            yield person
Example #7
0
    def scrape(self):
        """Yield a Person for the mayor, then one per councillor on COUNCIL_PAGE.

        Councillors are read from table rows in pairs: every odd-indexed
        ``<tr>`` holds the name/photo cells and the row after it holds the
        matching district and phone text, column by column.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        mayor_node = page.xpath('.//div[@class="item-page clearfix"]//table[1]//p')[1]
        mayor_name = mayor_node.xpath('.//strong/text()')[0]

        mayor = Person(primary_org='legislature', name=mayor_name, district='Pointe-Claire', role='Maire')
        mayor.add_source(COUNCIL_PAGE)

        mayor_numbers = re.findall(r'[0-9]{3}[ -][0-9]{3}-[0-9]{4}', mayor_node.text_content())
        mayor.add_contact('voice', mayor_numbers[0].replace(' ', '-'), 'legislature')
        yield mayor

        rows = page.xpath('//tr')
        for index in range(1, len(rows), 2):
            for column, cell in enumerate(rows[index].xpath('./td')):
                name = cell.text_content()
                details_row = rows[index + 1]
                district = details_row.xpath('.//td/p[1]/text()')[column].replace(' / ', '/')

                person = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
                person.add_source(COUNCIL_PAGE)
                person.image = cell.xpath('.//img/@src')[0]

                details_text = details_row.xpath('.//td')[column].text_content()
                numbers = re.findall(r'[0-9]{3}[ -][0-9]{3}-[0-9]{4}', details_text)
                person.add_contact('voice', numbers[0].replace(' ', '-'), 'legislature')

                yield person
Example #8
0
    def scrape(self):
        """Yield a Person for each non-vacant ``div.ligne`` entry on COUNCIL_PAGE.

        An entry listed under a heading containing "Mairie" becomes the mayor
        (role 'Maire', district 'Québec'). Every other entry becomes a
        councillor whose district name is extracted from the text of its
        ``target="_blank"`` link.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//div[contains(@class, "ligne")]')
        for councillor in councillors:
            # The heading reads "<last>, <first>"; reverse it into "<first> <last>".
            name = ' '.join(councillor.xpath('.//h3')[0].text_content().strip().split(', ')[::-1])
            if 'vacant' in name:
                continue
            # The closest preceding <h2> is the section this entry belongs to.
            district = councillor.xpath('./preceding-sibling::h2/text()')[-1]
            if 'Mairie' in district:
                district = 'Québec'
                role = 'Maire'
            else:
                text = councillor.xpath('.//a[@target="_blank"]/text()')
                # Raw string: \A, \d and \Z are regex tokens, not Python string escapes.
                district = re.search(
                    r'\ADistrict électoral (?:de|du|des) (.+) - ?\d+\Z',
                    text[0].strip().replace('\xa0', ''),
                    flags=re.U,
                ).group(1)
                role = 'Conseiller'

            if district == 'Monts':
                district = 'Les Monts'
            elif district == 'Plateau':
                district = 'Le Plateau'
            else:
                district = district.replace('–', '—')  # n-dash, m-dash
                district = re.sub(r'\Ala ', 'La ', district)

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.image = councillor.xpath('./p//img/@src')[0]

            phone = self.get_phone(councillor, area_codes=[418])
            p.add_contact('voice', phone, 'legislature')
            yield p
Example #9
0
    def scrape(self):
        """Yield the mayor, then a Person for each non-vacant row on COUNCIL_PAGE.

        Each councillor's name, photo and e-mail are read from the detail page
        linked in the second column. Rows whose district mentions
        "Greenfield Park" or "Conseiller n" are numbered as successive seats.
        """
        page = self.lxmlize(COUNCIL_PAGE, 'utf-8')

        yield self.scrape_mayor(page)

        rows = page.xpath('//tbody/tr')
        assert len(rows), 'No councillors found'
        seat_number = 1
        for row in rows:
            if row.xpath('./td[2]//text()')[0] == 'Vacant':
                continue

            district = row.xpath('./td[1]/text()')[0]
            if 'Greenfield Park' in district or 'Conseiller n' in district:
                district = 'Greenfield Park (siège {})'.format(seat_number)
                seat_number += 1

            detail_url = row.xpath('./td[2]/a/@href')[0]
            detail_page = self.lxmlize(detail_url, 'utf-8')
            name = detail_page.xpath('//h1/text()')[0]

            # Prefer the image whose alt text contains the name; otherwise
            # fall back to the image carrying the "droite" class.
            photos = detail_page.xpath('//img[contains(@alt, "{0}")]/@src'.format(name))
            if not photos:
                photos = detail_page.xpath('//img[contains(@class, "droite")]/@src')
            photo_url = photos[0]

            person = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
            person.add_source(COUNCIL_PAGE)
            person.add_source(detail_url)
            person.image = photo_url
            person.add_contact('email', self.get_email(detail_page))
            yield person
Example #10
0
    def scrape(self):
        """Yield the mayor, then a Person for each linked councillor cell.

        The first matching table cell on COUNCIL_PAGE is handed to
        scrape_mayor(); each remaining cell that contains a link is scraped as
        a councillor, with phone numbers, links and e-mail taken from the
        linked detail page.

        Raises:
            AssertionError: if COUNCIL_PAGE contains no matching cells.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//div[@class="article-content"]//td[@class="ms-rteTableOddCol-0"]')
        # Check before indexing so an empty page fails with this message
        # instead of an IndexError on councillors[0].
        assert len(councillors), 'No councillors found'
        yield self.scrape_mayor(councillors[0])
        for councillor in councillors[1:]:
            if not councillor.xpath('.//a'):
                continue

            texts = [text for text in councillor.xpath('.//text()') if clean_string(text)]
            name = texts[0]
            district = texts[1]
            url = councillor.xpath('.//a/@href')[0]
            # Use a separate name so the council listing in `page` is not shadowed.
            detail_page = self.lxmlize(url)

            p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            # The photo is the last image in the cells preceding this one.
            p.image = councillor.xpath('./preceding-sibling::td//img/@src')[-1]

            contacts = detail_page.xpath('.//td[@class="ms-rteTableOddCol-0"]//text()')
            for contact in contacts:
                # Any text containing four consecutive digits is treated as a phone number.
                if re.search(r'[0-9]{4}', contact):
                    phone = contact.strip().replace(' ', '-')
                    p.add_contact('voice', phone, 'legislature')
            get_links(p, detail_page.xpath('.//td[@class="ms-rteTableOddCol-0"]')[0])

            email = self.get_email(detail_page)
            p.add_contact('email', email)
            yield p
Example #11
0
    def scrape(self):
        """Yield a Person (role 'MLA') for each non-vacant row of the member table.

        The first table on COUNCIL_PAGE is used and its first row is skipped.
        Each remaining row must have exactly three cells: name ("Last, First"),
        constituency and party. E-mail and photo come from the member's page.
        """
        member_page = self.lxmlize(COUNCIL_PAGE, encoding='utf-8')
        rows = member_page.xpath('//table')[0].xpath('.//tr')[1:]
        assert len(rows), 'No members found'
        for row in rows:
            name_cell, district_cell, party_cell = row.xpath('.//td')
            listed_name = name_cell.text_content().strip()
            if listed_name.lower() == 'vacant':
                continue

            surname, given = listed_name.split(',')
            # Drop the honorific and put the given name first.
            name = '{} {}'.format(given.replace('Hon.', '').strip(), surname.title().strip())
            district = ' '.join(district_cell.text_content().split())
            party = get_party(party_cell.text)

            url = name_cell.xpath('.//a')[0].get('href')
            detail_page = self.lxmlize(url)
            email = self.get_email(detail_page)

            member = Person(primary_org='legislature', name=name, district=district, role='MLA', party=party)
            member.add_source(COUNCIL_PAGE)
            member.add_source(url)
            member.add_contact('email', email)

            photos = detail_page.xpath('//img[@class="page_graphic"]/@src')
            if photos:
                member.image = photos[0]

            yield member
Example #12
0
    def scrape(self):
        """Yield a Person for the mayor and each councillor on COUNCIL_PAGE.

        People are found in bold text inside table cells; bold text containing
        "@" or "NEWSLETTER" is skipped. Everyone receives the same general
        phone and fax numbers, read from the paragraphs following the
        ``p.large_title`` element.
        """
        page = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1')

        general_contacts = page.xpath('//p[@class="large_title"]/following-sibling::p/text()')
        general_phone = general_contacts[0]
        general_fax = general_contacts[1]

        councillors = page.xpath('//tr/td/p/b')
        for councillor in councillors:
            text = councillor.text_content()
            if '@' in text or 'NEWSLETTER' in text:
                continue

            if 'Mayor' in text:
                name = text.replace('Mayor', '')
                district = 'Dollard-Des Ormeaux'
                role = 'Maire'
            else:
                # Match the whole run of digits: a single-digit pattern would
                # cut "10" down to "1" and leave the name empty.
                name = re.split(r'[0-9]+', text)[1]
                district = 'District ' + re.findall(r'[0-9]+', text)[0]
                role = 'Conseiller'

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.image = councillor.xpath('./parent::p/parent::td/parent::tr/preceding-sibling::tr//img/@src')[0]

            email = self.get_email(councillor, './parent::p/following-sibling::p')
            p.add_contact('email', email)

            p.add_contact('voice', general_phone, 'legislature')
            p.add_contact('fax', general_fax, 'legislature')

            yield p
Example #13
0
    def scrape(self):
        """Yield the mayor (when scrape_mayor returns one) and every councillor.

        COUNCIL_PAGE supplies the links to the mayor's page and to the
        councillors' page. On the latter, each table row containing an image
        (except the last such row) is one councillor unless marked "Vacant".
        """
        page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT)

        mayor = self.scrape_mayor(page.xpath('//a[contains(text(), "Mayor")]/@href')[0])
        if mayor:
            yield mayor

        councillors_url = page.xpath('//a[contains(text(), "Councillors")]/@href')[0]
        councillors_page = self.lxmlize(councillors_url, user_agent=CUSTOM_USER_AGENT)

        rows = councillors_page.xpath('//tr[td//img]')[:-1]

        assert len(rows), 'No councillors found'
        for row in rows:
            # Each row is expected to have exactly two children: photo cell, info cell.
            img_cell, info_cell = row
            if info_cell.xpath('.//p//text()[contains(., "Vacant")]'):
                continue

            # Keep only text nodes that are non-empty once non-breaking spaces are blanked.
            texts = [
                fragment.strip()
                for fragment in info_cell.xpath('.//text()')
                if re.sub('\xa0', ' ', fragment).strip()
            ]
            name = texts[0].replace('Councillor ', '')
            district = info_cell.xpath('.//p[contains(text(), "District")]//text()')[0]
            email = self.get_email(info_cell)
            phone = self.get_phone(info_cell, area_codes=[438, 514], error=False)
            img_url = urljoin(councillors_url, img_cell.xpath('.//img/@src')[0])

            person = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
            person.add_source(COUNCIL_PAGE)
            person.add_source(councillors_url)
            person.add_contact('email', email)
            if phone:
                person.add_contact('voice', phone, 'legislature')
            person.image = img_url
            yield person
Example #14
0
    def scrape(self):
        """Yield a Person for each row of the ``Table1table`` table on COUNCIL_PAGE.

        The second cell lists the role, the name and (optionally) the district;
        when no district line is present the district is 'Milton'. Only the
        first row gets an address contact. Each "<type>: <number>" line in the
        third cell becomes a contact.
        """
        role_names = {
            'Mayor and Regional Councillor': 'Mayor',
            'Local & Regional Councillor': 'Regional Councillor',
            'Local Councillor': 'Councillor',
        }

        page = self.lxmlize(COUNCIL_PAGE)

        rows = page.xpath('//table[@id="Table1table"]/tbody/tr')
        assert len(rows), 'No councillors found'
        for index, row in enumerate(rows):
            lines = row.xpath('./td[2]/p/text()')
            name = lines[1]
            role = lines[0].strip()
            role = role_names.get(role, role)
            district = 'Milton' if len(lines) < 3 else lines[2]

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)

            person.image = row.xpath('./td[1]/p//img/@src')[0]

            if index == 0:
                address_lines = row.xpath('./td[3]/p[1]/text()')
                address = ', '.join(address_lines).replace('Email:', '').strip()
                person.add_contact('address', address, 'legislature')

            for entry in row.xpath('./td[3]/p[2]/text()'):
                label, value = entry.split(':')
                value = value.replace(', ext ', ' x').strip()
                # The label is passed both as the contact type and as the note.
                person.add_contact(label, value, label)

            yield person
Example #15
0
    def scrape(self):
        """Yield a Person for each ``div.member-container`` on COUNCIL_PAGE.

        A member whose position text contains "Maire" becomes the mayor of
        'Westmount'; everyone else is a 'Conseiller' whose district is the
        first text node of the ``div.entry-content`` element.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        for member in page.xpath('//div[@class="member-container"]'):
            name = member.xpath('.//h3')[0].text_content()
            position = member.xpath('.//div[@class="member-position"]')[0].text_content()
            if 'Maire' in position:
                role, district = 'Maire', 'Westmount'
            else:
                role = 'Conseiller'
                district = member.xpath('.//div[@class="entry-content"]/text()')[0]

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)

            # The press-photo link's href is used as the image.
            press_photos = member.xpath('.//a[@title="Photo pour la presse"]/@href')
            person.image = press_photos[0]
            person.add_contact('email', self.get_email(member))

            yield person
Example #16
0
    def scrape(self):
        """Yield a Person for each member block found on COUNCIL_PAGE.

        A block whose district text contains "Maire" is the mayor of
        'Saint-Jérôme'; otherwise the district text, with "numéro " removed,
        is used as the councillor's district.
        """
        page = self.lxmlize(COUNCIL_PAGE, encoding='utf-8')
        members = page.xpath('//div[contains(@class," inner_member")]')
        assert len(members), 'No councillors found'

        for member in members:
            name = member.xpath('.//h2/text()')[0]
            district_text = member.xpath('.//div[contains(@class,"district")]/text()')[0]
            district = district_text.replace('numéro ', '')

            is_mayor = 'Maire' in district
            if is_mayor:
                district = 'Saint-Jérôme'
            role = 'Maire' if is_mayor else 'Conseiller'

            # The portrait URL is read from the image's data-lazy-src attribute.
            image = member.xpath('.//div[@class="portrait_single"]/img/@data-lazy-src')[0]
            phone = member.xpath('.//div[contains(@class,"phone")]/text()')[0]

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)
            person.image = image

            person.add_contact('voice', phone, 'legislature')
            person.add_contact('email', self.get_email(member))

            yield person
Example #17
0
    def scrape(self):
        """Yield a Person for each two-cell first table row on COUNCIL_PAGE.

        The last qualifying row is discarded. A row with exactly three
        non-blank text fragments is the mayor; otherwise the first fragment is
        the district. The last three fragments are always name, phone and
        e-mail, in that order.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        first_rows = page.xpath('//table//tr[1]')
        member_rows = [row for row in first_rows if len(row) == 2][:-1]
        for row in member_rows:
            fragments = []
            for text in row.xpath('.//text()[normalize-space()]'):
                if text.strip():
                    fragments.append(text.strip())

            if len(fragments) == 3:
                role, district = 'Maire', 'Saint-Jérôme'
            else:
                role, district = 'Conseiller', fragments[0].replace('numéro ', '')

            name, phone, email = fragments[-3], fragments[-2], fragments[-1]

            image = row.xpath('.//img/@src')[0]

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)
            person.image = image
            person.add_contact('voice', phone, 'legislature')
            person.add_contact('email', email)
            yield person
Example #18
0
    def scrape(self):
        """Yield a Person (role 'MLA') for each non-empty row of the first table.

        The district is taken from the link text containing "MLA": the part
        after the colon is split on the hyphen and rejoined from its first two
        stripped pieces, with "St " expanded to "St. ". E-mail, phone and photo
        are added only when scrape_extended_info() returns a truthy result.
        """
        page = self.lxmlize(COUNCIL_PAGE)
        rows = page.xpath('//table[1]//tr')

        assert len(rows), 'No members found'
        for row in rows:
            if not row.text_content().strip():
                continue

            name = row.xpath('./td[2]//a[1]//text()')[0]

            mla_text = row.xpath('./td[2]//a[contains(.//text(), "MLA")]//text()')[0]
            pieces = mla_text.split(':')[1].replace('St ', 'St. ').split('-')
            district = '{}-{}'.format(pieces[0].strip(), pieces[1].strip())

            url = row.xpath('./td[2]//a[1]/@href')[0]
            details = self.scrape_extended_info(url)

            member = Person(primary_org='legislature', name=name, district=district, role='MLA')
            member.add_source(COUNCIL_PAGE)
            member.add_source(url)

            if details:  # member pages might return errors
                email, phone, photo_url = details
                member.image = photo_url
                if email:
                    member.add_contact('email', email)
                if phone:
                    member.add_contact('voice', phone, 'legislature')
            yield member
Example #19
0
    def scrape(self):
        """Yield a Person for each row of the ``Table1table`` table on COUNCIL_PAGE.

        If the first text line of the second cell contains "Mayor", the name is
        that line with the title removed and the district is 'Milton'.
        Otherwise the line is split into role and district just before "Ward",
        and the name is the second text line.
        """
        role_names = {
            'Town and Regional Councillor': 'Regional Councillor',
            'Town Councillor': 'Councillor',
        }

        page = self.lxmlize(COUNCIL_PAGE)

        rows = page.xpath('//table[@id="Table1table"]/tbody/tr')
        assert len(rows), 'No councillors found'
        for row in rows:
            lines = row.xpath('./td[2]/p/text()')
            heading = lines[0].strip()
            if 'Mayor' in heading:
                name = heading.replace('Mayor and Regional Councillor', '')
                role, district = 'Mayor', 'Milton'
            else:
                name = lines[1]
                # Split on the space that directly precedes "Ward".
                role, district = re.split(r' (?=Ward)', heading)
                role = role_names.get(role, role)

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)

            person.image = row.xpath('./td[1]/p//img/@src')[0]

            for entry in row.xpath('./td[3]/p[2]/text()'):
                label, value = entry.split(':')
                value = value.replace(', ext ', ' x').strip()
                # The label is passed both as the contact type and as the note.
                person.add_contact(label, value, label)

            yield person
Example #20
0
    def scrape(self):
        """Yield the mayor, then a Person for each councillor linked from COUNCIL_PAGE.

        Each paragraph containing "-" in the article body links to a
        councillor's own page, from which the name, ward number, photo and
        contact details are read.
        """
        page = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1')

        yield self.scrape_mayor()

        councillors = page.xpath('//div[@class="articlebody-inside"]//p[contains(text(),"-")]')
        for councillor in councillors:
            url = councillor.xpath('.//a')[0].attrib['href'].replace('../', '')
            # Use a separate name so the council listing in `page` is not shadowed.
            detail_page = self.lxmlize(url, 'iso-8859-1')

            name = detail_page.xpath('//div[@class="articletitle"]/h1')[0].text_content().replace('Councillor', '').replace('Deputy Mayor', '')
            body_paragraphs = detail_page.xpath('//div[@class="articlebody-inside"]/p')
            # Raw string: \D is a regex token, not a Python string escape.
            # The ward number is whatever digits the first paragraph contains.
            district = 'Ward {}'.format(re.sub(r'\D+', '', body_paragraphs[0].text_content()))

            p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            photo_url_rel = detail_page.xpath('//div[@class="articlebody-inside"]/p/img/@src')[0].replace('/..', '')
            p.image = urljoin(url, photo_url_rel)

            # The second paragraph is a run of "<Label>: <value>" pairs; splitting
            # on ':' leaves each value followed by the next pair's label.
            contacts = body_paragraphs[1].text_content().replace('Biography', '').replace('Committees', '').split(':')
            for i, contact in enumerate(contacts):
                if i == 0 or not contact:
                    continue
                # The label for this value is the first capitalised word of the previous chunk.
                contact_type = re.findall(r'([A-Z][a-z]+)', contacts[i - 1])[0]
                if contact_type != 'Address':
                    # Cut the value at the first capital letter, where the next label starts.
                    contact = re.split(r'[A-Z]', contact)[0]
                contact_type = CONTACT_DETAIL_TYPE_MAP[contact_type]
                p.add_contact(contact_type, contact, '' if contact_type == 'email' else 'legislature')
            yield p
Example #21
0
    def scrape(self):
        """Yield a Person for each ``table.councilTable`` on COUNCIL_PAGE.

        The alt text of the first-cell image identifies the person: if it
        contains "Mayor" it is used as the mayor's name (district 'Ajax');
        otherwise it is split as "<name> - <role> Ward <n>".
        """
        page = self.lxmlize(COUNCIL_PAGE)

        tables = page.xpath('//table[@class="councilTable"]')
        assert len(tables), 'No councillors found'
        for table in tables:
            image = table.xpath('.//@src')[0]
            alt = table.xpath('.//tr/td[1]/p[1]/img/@alt')[0]

            if 'Mayor' in alt:
                name, district, role = alt, 'Ajax', 'Mayor'
            else:
                name, remainder = alt.split(' - ', 1)
                # District is what follows "Councillor "; role is what precedes "Ward ".
                district = remainder.split('Councillor ', 1)[-1].strip()
                role = remainder.split('Ward ', 1)[0].strip()

            # The paragraph mentioning "Cel" has the cell number first and the voice number second.
            phone_lines = table.xpath('.//p[contains(.,"Cel")]/text()')
            cell = phone_lines[0].replace('\xa0', ' ')
            voice = phone_lines[1]
            email = self.get_email(table)

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)
            person.image = image

            if cell:
                person.add_contact('cell', cell, 'legislature')
            if voice:
                person.add_contact('voice', voice, 'legislature')
            if email:
                person.add_contact('email', email)
            yield person
Example #22
0
    def scrape(self):
        """Yield a ``Person`` for each non-vacant entry in every council section.

        A section whose preceding ``h2`` heading contains 'Mairie' yields the
        mayor; otherwise the district name is parsed from the job title.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        for section in page.xpath('//div[contains(@class, "membres-conseil-municipal")]'):
            members = section.xpath('./div')
            assert len(members), 'No councillors found'
            for member in members:
                # Text nodes of the heading are joined in reverse order.
                name = ' '.join(reversed(member.xpath('./h3//text()')))
                if 'vacant' in name.lower():
                    continue

                header = section.xpath('./preceding-sibling::h2/text()')[-1]
                if 'Mairie' not in header:
                    title = member.xpath('./p[@itemprop="jobTitle"]/a/text()')[0]
                    match = re.search(r'\ADistrict (?:de(?: la)?|du|des) ([\w —–-]+)', title, flags=re.U)
                    district = match.group(1)
                    role = 'Conseiller'
                else:
                    district = 'Québec'
                    role = 'Maire'

                if district == 'Saules':
                    district = 'Les Saules'
                else:
                    district = district.replace('–', '—')  # n-dash to m-dash

                p = Person(primary_org='legislature', name=name, district=district, role=role)
                p.add_source(COUNCIL_PAGE)
                p.image = member.xpath('./figure//@src')[0]
                p.add_contact('voice', self.get_phone(member, area_codes=[418]), 'legislature')
                yield p
Example #23
0
    def scrape(self):
        """Yield a ``Person`` for each member page linked from the council page.

        Links containing '@' are skipped. A page whose main content mentions
        'Mayor' yields the mayor; every other page gets the next numbered
        'Saanich (seat N)' district.
        """
        seat = 1

        listing = self.lxmlize(COUNCIL_PAGE)
        urls = listing.xpath('//div[contains(@class, "entry")]')[0].xpath('.//@href')
        assert len(urls), 'No councillors found'
        for url in urls:
            if '@' in url:
                continue

            detail = self.lxmlize(url)
            main = detail.xpath('//main[@id="content"]')[0]
            name = main.xpath('.//h1//text()')[0]

            if 'Mayor' not in main.text_content():
                role = 'Councillor'
                district = 'Saanich (seat {})'.format(seat)
                seat += 1
            else:
                name = name.replace('Mayor ', '')
                role = 'Mayor'
                district = 'Saanich'

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            # First ``src`` attribute anywhere on the detail page.
            p.image = detail.xpath('.//@src')[0]
            p.add_contact('voice', self.get_phone(detail, area_codes=[250]), 'legislature')
            p.add_contact('email', self.get_email(main))
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            yield p
Example #24
0
    def scrape(self):
        """Yield a ``Person`` with role 'MPP' for every address block on the listing page.

        Name, riding, email and phone come from the listing block; the photo
        and party come from the member's own page.
        """
        listing = self.lxmlize(COUNCIL_PAGE)
        for block in listing.xpath('//div[@class="addressblock"]'):
            link = block.xpath('.//a[@class="mpp"]')[0]
            # Collapse runs of whitespace inside the link text.
            name = ' '.join(link.text.split())

            riding = block.xpath('.//div[@class="riding"]//text()')[0].strip()
            riding = riding.replace('--', '\u2014')
            district = riding.replace('Chatham—Kent', 'Chatham-Kent')  # m-dash to hyphen
            mpp_url = link.attrib['href']

            detail = self.lxmlize(mpp_url)

            portraits = detail.xpath('//img[@class="mppimg"]/@src')
            party = detail.xpath('//div[@class="mppinfoblock"]/p[last()]/text()')[0].strip()

            p = Person(primary_org='legislature', name=name, district=district, role='MPP', party=party)
            if portraits:
                p.image = portraits[0]
            p.add_source(COUNCIL_PAGE)
            p.add_source(mpp_url)

            # Email and phone are optional on the listing block.
            email_nodes = block.xpath('.//div[@class="email"]')
            if email_nodes:
                p.add_contact('email', self.get_email(email_nodes[0]))

            phone_texts = block.xpath('.//div[@class="phone"]//text()')
            if phone_texts:
                p.add_contact('voice', phone_texts[0], 'legislature')

            yield p
Example #25
0
    def scrape(self):
        """Yield a ``Person`` for each link inside a ``WSIndent`` paragraph.

        Link text containing 'Ward N' denotes a councillor of that ward; any
        other link is treated as the mayor of 'Kawartha Lakes'.
        """
        listing = self.lxmlize(COUNCIL_PAGE)

        for link in listing.xpath('//p[@class="WSIndent"]/a'):
            label = link.text_content()
            wards = re.findall(r'(Ward [0-9]{1,2})', label)
            if wards:
                district = wards[0]
                name = label.replace(district, '').strip()
                role = 'Councillor'
            else:
                district = 'Kawartha Lakes'
                name = label.replace('Mayor', '').strip()
                role = 'Mayor'

            # Email and photo are read from the member's own page.
            url = link.attrib['href']
            detail = self.lxmlize(url)
            email = self.get_email(detail)
            image = detail.xpath('//img[@class="image-right"]/@src')[0]

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.add_contact('email', email)
            p.image = image
            yield p
Example #26
0
    def scrape(self):
        """Yield a ``Person`` per council heading, paired with a contact-table row.

        Headings from the council page and rows from the contact page are
        matched purely by position; ``zip`` stops at the shorter sequence.
        """
        seat = 1

        council = self.lxmlize(COUNCIL_PAGE)
        contacts = self.lxmlize(CONTACT_PAGE)
        headings = council.xpath('//div[@id="main-content"]//h3')
        # The first row of the contact table is skipped.
        rows = contacts.xpath('//p[contains(./strong/text(), "Mayor & Council")]/following-sibling::table[1]//tr')[1:]

        for heading, row in zip(headings, rows):
            text = heading.text_content()
            is_councillor = text.startswith('Councill')
            if is_councillor:
                role = 'Councillor'
                district = 'Abbotsford (seat {})'.format(seat)
                seat += 1
            else:
                role = 'Mayor'
                district = 'Abbotsford'
            # Everything after the first space is the name.
            name = text.split(' ', 1)[1]
            image = heading.xpath('./img/@src')[0]
            phone = row.xpath('./td[2]/text()')[0]
            fax = row.xpath('./td[3]/text()')[0]

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(CONTACT_PAGE)
            p.image = image
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('fax', fax, 'legislature')

            yield p
Example #27
0
    def scrape(self):
        """Yield a ``Person`` for each "members/" link on the council page.

        The email is read once from the contact page and attached to every
        person; role and name are split from the first heading text of each
        member page.
        """
        seat = 1

        shared_email = self.get_email(self.lxmlize(CONTACT_URL))

        listing = self.lxmlize(COUNCIL_PAGE)
        for url in listing.xpath('//a/@href[contains(., "members/")]'):
            detail = self.lxmlize(url)
            heading = detail.xpath('//h1//text()')[0]
            role, name = heading.split(' ', 1)
            # First image found anywhere on the member page.
            photo_url = detail.xpath('//img/@src')[0]

            if role != 'Mayor':
                district = 'Richmond (seat {})'.format(seat)
                seat += 1
            else:
                district = 'Richmond'

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.image = photo_url
            p.add_source(COUNCIL_PAGE)
            p.add_source(CONTACT_URL)
            p.add_source(url)
            p.add_contact('email', shared_email)
            yield p
Example #28
0
    def scrape(self):
        """Yield a ``Person`` for every odd-positioned row of the content table.

        A title found in the bold text ('Deputy Warden' is tested before
        'Warden') becomes the role and is removed from the name; all other
        rows become numbered 'Lambton (seat N)' councillors.
        """
        seat = 1

        page = self.lxmlize(COUNCIL_PAGE)
        rows = page.xpath('//div[@id="content"]//table//tr[position() mod 2 = 1]')
        assert len(rows), 'No councillors found'
        for row in rows:
            text = row.xpath('.//strong/text()')[0]
            # Order matters: 'Warden' is a substring of 'Deputy Warden'.
            for title in ('Deputy Warden', 'Warden'):
                if title in text:
                    role = title
                    name = text.replace(title, '')
                    district = 'Lambton'
                    break
            else:
                role = 'Councillor'
                name = text
                district = 'Lambton (seat {})'.format(seat)
                seat += 1

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)

            p.image = row.xpath('.//img/@src')[0]
            p.add_contact('email', self.get_email(row))

            yield p
Example #29
0
    def scrape(self):
        """Yield a ``Person`` for every ``strong`` element inside ``printArea``.

        The first non-blank text line of the parent element gives the
        district (or marks the mayor); the remaining lines are parsed as
        "note: number" contact entries.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        for strong in page.xpath('//div[@id="printArea"]//strong'):
            # Fall back to a parent <div> when the parent <p> has no text.
            lines = strong.xpath('./parent::p/text()') or strong.xpath('./parent::div/text()')
            lines = [line for line in lines if line.strip()]
            heading, contact_lines = lines[0], lines[1:]

            # Drop everything following "Ward <digit>".
            district = re.sub(r'(?<=Ward \d).+', '', heading)
            if 'Mayor' not in district:
                district = district.replace('Councillor', '').strip()
                role = 'Councillor'
            else:
                district = 'Woolwich'
                role = 'Mayor'

            p = Person(primary_org='legislature', name=strong.text_content(), district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.image = strong.xpath('./img/@src')[0]

            for entry in contact_lines:
                note, num = entry.split(':')
                num = num.strip().replace('(', '').replace(') ', '-').replace('extension ', 'x')
                p.add_contact(note, num, note)
            yield p
Example #30
0
    def scrape(self):
        """Yield a ``Person`` for each table row in the ``field-item even`` block.

        The first cell's bold text gives the district (or marks the mayor of
        'Senneville'); the name is the title-cased first text of the second
        cell. The image is optional.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        rows = page.xpath('//div[@class="field-item even"]//tr')
        assert len(rows), 'No councillors found'
        for row in rows:
            label = row.xpath('./td[1]//strong/text()')[0]
            district = label.replace('no. ', '')
            if 'Maire' in district:
                district, role = 'Senneville', 'Maire'
            else:
                role = 'Conseiller'
            name = row.xpath('./td[2]//p//text()')[0].title()
            email = self.get_email(row)

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            # Rows without an image simply get no photo.
            images = row.xpath('.//img/@src')
            if images:
                p.image = images[0]
            p.add_contact('email', email)
            yield p
Example #31
0
    def scrape(self):
        """Yield a ``Person`` with role 'MLA' for each non-empty row of the first table.

        The district is rebuilt from the two hyphen-separated parts of the
        "MLA" link text; email, phone and photo come from
        ``scrape_extended_info`` when it returns a value.
        """
        page = self.lxmlize(COUNCIL_PAGE)
        rows = page.xpath('//table[1]//tr')

        assert len(rows), 'No members found'
        for row in rows:
            if not row.text_content().strip():
                continue

            name = row.xpath('./td[2]//a[1]//text()')[0]

            mla_label = row.xpath('./td[2]//a[contains(.//text(), "MLA")]//text()')[0]
            # Text after the first ':' holds the district; its two
            # hyphen-separated parts are trimmed and re-joined.
            parts = mla_label.split(':')[1].replace('St ', 'St. ').split('-')
            district = parts[0].strip() + '-' + parts[1].strip()

            url = row.xpath('./td[2]//a[1]/@href')[0]
            ext_infos = self.scrape_extended_info(url)

            p = Person(primary_org='legislature', name=name, district=district, role='MLA')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            if ext_infos:  # member pages might return errors
                email, phone, photo_url = ext_infos
                p.image = photo_url
                if email:
                    p.add_contact('email', email)
                if phone:
                    p.add_contact('voice', phone, 'legislature')
            yield p
Example #32
0
    def scrape(self):
        """Yield a ``Person`` with role 'MLA' for each non-vacant row of the content table.

        The email is rebuilt from character codes embedded in a script on the
        member's detail page.
        """
        def decode(code):
            # Numeric codes become characters; anything else is kept verbatim.
            try:
                return chr(int(code))
            except ValueError:
                return code

        page = self.lxmlize(COUNCIL_PAGE)
        for row in page.xpath('//div[@id="content"]/table/tbody/tr'):
            cells = row.xpath('./td//text()')
            if 'Vacant' in cells[0]:
                continue

            full_name, party, district = cells[:3]
            # "Last, First" -> "First Last" (parts are joined with a space).
            name = ' '.join(reversed(full_name.split(',')))

            p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=self.PARTIES[party])

            detail_url = row[0][0].attrib['href']
            detail = self.lxmlize(detail_url)

            p.image = detail.xpath('//img[@class="portrait"]/@src')[0]

            # A missing or unparsable phone number is ignored.
            try:
                p.add_contact('voice', detail.xpath('//dd[@class="numbers"]/text()')[0].split(': ')[1], 'legislature')
            except IndexError:
                pass

            script = detail.xpath('//dd/script/text()')
            if script:
                # The quoted values are stored in reverse order in the script.
                content = ''.join(map(decode, reversed(re.findall(r"]='(.+?)'", script[0]))))
                p.add_contact('email', re.search(r'>(.+)<', content).group(1))

            p.add_source(COUNCIL_PAGE)
            p.add_source(detail_url)
            yield p
Example #33
0
    def scrape(self):
        """Yield a ``Person`` for every council-member section on the page.

        A section with a "District " label is a councillor of that district;
        the first section without one is the mayor of 'Pointe-Claire'.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        sections = page.xpath('//section[contains(@id, "js-council-member")]')
        assert len(sections), 'No councillors found'
        for index, section in enumerate(sections):
            name = ' '.join(section.xpath('.//h2/text()'))
            labels = section.xpath('.//span[contains(@class, "c-info-list_label")][contains(text(), "District ")]')

            if labels:
                # Keep only the part before the spaced n-dash.
                district = labels[0].text_content().split(' – ')[0]
                role = 'Conseiller'
            elif index == 0:
                district = 'Pointe-Claire'
                role = 'Maire'
            else:
                # NOTE(review): no label on a non-first section leaves the
                # district as an empty list — confirm this is intended.
                district = labels
                role = 'Conseiller'

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.image = section.xpath('.//@src')[0]
            p.add_contact('email', self.get_email(section))
            p.add_contact('voice', self.get_phone(section, area_codes=[514]), 'legislature')
            p.add_source(COUNCIL_PAGE)
            yield p
Example #34
0
    def scrape(self):
        """Yield a ``Person`` for each member linked from the council page.

        Every linked detail page is fetched; role and district come from its
        section heading, while address, phone and fax are extracted with
        regular expressions from the text of a table cell.
        """
        # Numbers the seats of members whose heading yields no district.
        regional_councillor_seat_number = 1

        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//center/center//a')
        for councillor in councillors:
            name = councillor.text_content().strip()
            url = councillor.attrib['href']
            # ``page`` is rebound to the detail page; the link list above was
            # already collected from the council page.
            page = self.lxmlize(url)
            header = page.xpath(
                '//div[@class="sectionheading"]')[0].text_content()
            if header == 'Mayor of Richmond Hill':
                district = 'Richmond Hill'
                role = 'Mayor'
            else:
                # The district is whatever sits between a comma and the last
                # hyphen of the heading, if any.
                district = re.findall(r',(.*)-', header)
                if district:
                    district = district[0].strip()
                else:
                    district = 'Richmond Hill (seat {})'.format(
                        regional_councillor_seat_number)
                    regional_councillor_seat_number += 1

                role = 'Regional Councillor' if 'Regional' in header else 'Councillor'

            # Last cell of a row in a table that either has a positive
            # ``cellpadding`` or no ``cellpadding`` attribute at all.
            info = page.xpath(
                '//table[@cellpadding>0]/tbody/tr/td[last()]|//table[not(@cellpadding)]/tbody/tr/td[last()]'
            )
            # Normalise " - office:" to ":" before the regex extraction below.
            info = info[0].text_content().replace(' - office:', ':')

            # Text between "Town of Richmond Hill" and "Telephone".
            address = re.findall(
                r'(?<=Town of Richmond Hill)(.*(?=Telephone:)|(?=Telephone))',
                info)[0]
            # Insert a space wherever a lowercase letter is directly followed
            # by an uppercase one.
            address = re.sub(r'([a-z])([A-Z])', r'\1 \2', address)
            # I expected to be able to do '(.*)(?=\sTelephone|Telephone|Fax)', but nope.
            # The pattern has several groups, so findall returns tuples;
            # [0][0] is the outermost group of the first match.
            phone = re.findall(
                r'(?<=Telephone:) ((.*) (?=Telephone)|(.*)(?=Telephone)|(.*)(?=Fax))',
                info)[0][0].replace('(',
                                    '').replace(') ',
                                                '-').replace(', ext. ', ' x')
            # Text between "Fax: " and "E-mail", with spaces and parentheses
            # normalised.
            fax = re.findall(r'(?<=Fax:) (.*)(?=E-mail)', info)[0].replace(
                ' ', '').replace('(', '').replace(')', '-')
            email = self.get_email(page)

            p = Person(primary_org='legislature',
                       name=name,
                       district=district,
                       role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.add_contact('address', address, 'legislature')
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('fax', fax, 'legislature')
            p.add_contact('email', email)
            # The photo is the image whose ``alt`` text contains the name.
            p.image = page.xpath(
                '//img[contains(@alt, "{}")]/@src'.format(name))[0]
            if 'Website' in info:
                p.add_link(re.findall(r'www\..*\.[a-z]+', info)[0])
            yield p
Example #35
0
    def scrape(self):
        """Yield a ``Person`` for each non-vacant entry in the content table.

        Entries with fewer than three bold text parts are treated as the
        mayor of 'Dorval'; otherwise the third part is the district.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        entries = page.xpath('//div[@id="large_content"]//td/p[2]')
        assert len(entries), 'No councillors found'
        for entry in entries:
            parts = entry.xpath('./strong/text()')

            # In case the name spans on 2 lines: merge the first two parts and
            # split the third one on the hyphen.
            if len(parts) > 2 and 'Councillor' not in parts[1]:
                role_part, district_part = parts[2].split('-')
                parts = [parts[0] + parts[1], role_part, district_part]

            name = parts[0]

            if 'Vacant' in parts:
                continue

            if len(parts) >= 3:
                district = parts[2]
                role = 'Conseiller'
            else:
                district = 'Dorval'
                role = 'Maire'

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)

            p.image = entry.xpath('./preceding-sibling::p/img/@src')[0]

            p.add_contact('email', self.get_email(entry))

            yield p
Example #36
0
    def scrape(self):
        """Yield councillor records built by ``councillor_data``, then the mayor.

        Councillors come from the 105-wide table cells; the mayor comes from
        the first 315-wide cell and gets a hard-coded email.
        """
        page = self.lxmlize(COUNCIL_PAGE, 'utf-8')
        cells = page.xpath('//td[@width="105"]')
        assert len(cells), 'No councillors found'
        for cell in cells:
            url = urljoin(COUNCIL_PAGE, cell.xpath('.//a/@href')[0])
            link_text = cell.xpath('.//a//text()')[0]
            ward = re.search('([A-Z].+) Ward', link_text).group(1)
            # Spaced n-dash and spaced hyphen both become an m-dash.
            for spaced_dash in (' – ', ' - '):
                ward = ward.replace(spaced_dash, '—')
            ward = ward.replace('St. Norbert', 'St Norbert')  # to match ocd-division-ids
            name = ' '.join(cell.xpath('.//span[@class="k80B"][1]/text()'))
            yield self.councillor_data(url, name, ward)

        mayor_cell = page.xpath('//td[@width="315"]')[0]
        # Strip the leading 'Mayor ' from the link text.
        mayor_name = mayor_cell.xpath('./a//text()')[0][len('Mayor '):]
        mayor_photo_url = mayor_cell.xpath('./img/@src')[0]

        mayor = Person(primary_org='legislature', name=mayor_name, district='Winnipeg', role='Mayor')
        mayor.add_source(COUNCIL_PAGE)
        # @see http://www.winnipeg.ca/interhom/mayor/MayorForm.asp?Recipient=CLK-MayorWebMail
        mayor.add_contact('email', '*****@*****.**')  # hardcoded
        mayor.image = mayor_photo_url
        yield mayor
Example #37
0
    def scrape(self):
        """Yield a ``Person`` with role 'MLA' for each non-vacant row of the first table.

        Names listed as "Last, First" are rearranged to "First Last"; the
        email and optional photo come from the member's own page.
        """
        listing = self.lxmlize(COUNCIL_PAGE, encoding='utf-8')
        # The first row of the table is skipped.
        rows = listing.xpath('//table')[0].xpath('.//tr')[1:]
        assert len(rows), 'No members found'
        for row in rows:
            name_cell, constituency_cell, party_cell = row.xpath('.//td')
            full_name = name_cell.text_content().strip()
            if full_name.lower() == 'vacant':
                continue

            last, first = full_name.split(',')
            # Drop the 'Hon.' prefix and title-case the surname.
            name = '{} {}'.format(first.replace('Hon.', '').strip(), last.title().strip())
            # Collapse runs of whitespace in the constituency name.
            district = ' '.join(constituency_cell.text_content().split())
            party = get_party(party_cell.text)

            url = name_cell.xpath('.//a')[0].get('href')

            detail = self.lxmlize(url)
            email = self.get_email(detail)

            p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=party)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.add_contact('email', email)

            images = detail.xpath('//img[@class="page_graphic"]/@src')
            if images:
                p.image = images[0]

            yield p
Example #38
0
    def scrape(self):
        """Yield a ``Person`` for every ``councillorwrapper`` block on the page.

        The first block with an empty district label is the mayor of
        'Calgary', who is the only person given an email.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        wrappers = page.xpath('//div[contains(@class, "councillorwrapper")]')
        assert len(wrappers), 'No councillors found'
        for index, wrapper in enumerate(wrappers):
            name = wrapper.xpath('.//h4/text()')[0]
            district = wrapper.xpath('.//h4/span/text()')[0].strip()

            is_mayor = index == 0 and not district
            if is_mayor:
                district, role, email = 'Calgary', 'Mayor', '*****@*****.**'
            else:
                role, email = 'Councillor', None

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.image = wrapper.xpath('.//@src')[0]
            if email:
                p.add_contact('email', email)
            p.add_source(COUNCIL_PAGE)
            yield p
Example #39
0
    def scrape(self):
        """Yield the mayor, then a ``Person`` for each councillor table row.

        Councillors sharing a ward are numbered "<ward> (seat N)" in page
        order; contact details, photo and an optional link come from each
        councillor's own page.
        """
        seat_numbers = defaultdict(int)

        listing = self.lxmlize(COUNCIL_PAGE)

        yield self.scrape_mayor()

        for row in listing.xpath('//div[@id="centre_content"]//tr'):
            # Skip rows containing the word 'Position'.
            if 'Position' in row.text_content():
                continue

            cells = row.xpath('./td')
            ward = cells[0].text_content().replace('Councillor', '')
            seat_numbers[ward] += 1
            district = '{} (seat {})'.format(ward, seat_numbers[ward])
            name = cells[1].text_content()
            url = row.xpath('./td/a')[0].attrib['href']

            p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            detail = self.lxmlize(url)

            content = detail.xpath('//div[@id="centre_content"]')[0]
            p.add_contact('email', self.get_email(content))
            p.add_contact('voice', self.get_phone(content, area_codes=[226, 519]), 'legislature')

            p.image = detail.xpath('string(//div[@id="centre_content"]//img/@src)')  # can be empty

            # With more than two links in the content, the last one is added as a link.
            links = detail.xpath('//div[@id="centre_content"]//a')
            if len(links) > 2:
                p.add_link(links[-1].attrib['href'])
            yield p
Example #40
0
    def scrape(self):
        """Yield councillor records built by ``councillor_data``, then the mayor.

        Councillors come from every captioned panel except the first; the
        mayor comes from the first image panel plus a phone number read from
        the mayor's page.
        """
        page = self.lxmlize(COUNCIL_PAGE)
        for panel in page.xpath('//div[contains(@class,"cocis-has-caption")]')[1:]:
            url = urljoin(COUNCIL_PAGE, panel.xpath('.//a[1]/@href')[0])
            name = panel.xpath('.//a//text()')[0]
            # The ward is the bold text without its last word.
            label_words = panel.xpath('.//strong//text()')[0].split()
            ward = ' '.join(label_words[:-1])
            yield self.councillor_data(url, name, ward)

        mayor_panel = page.xpath('//div[contains(@class, "cocis-image-panel")]')[0]
        photo_url = urljoin(COUNCIL_PAGE, mayor_panel.xpath('.//img/@src')[0])
        mayor_name = mayor_panel.xpath('.//a//text()')[0]
        mayor_page = self.lxmlize(MAYOR_PAGE)
        # Email behind mailhide, so only the phone number is collected.
        phone = self.get_phone(mayor_page, area_codes=[403])

        mayor = Person(primary_org='legislature', name=mayor_name, district='Calgary', role='Mayor')
        mayor.add_source(COUNCIL_PAGE)
        mayor.add_source(MAYOR_PAGE)
        mayor.add_contact('voice', phone, 'legislature')
        mayor.image = photo_url
        yield mayor
Example #41
0
    def scrape(self):
        """Yield a ``Person`` for every odd-positioned row of the content table.

        Rows whose bold text contains 'Deputy Warden' or 'Warden' take that
        title as role with district 'Lambton'; other rows are councillors in
        numbered 'Lambton (seat N)' districts.
        """
        next_seat = 1

        page = self.lxmlize(COUNCIL_PAGE)
        rows = page.xpath('//div[@id="content"]//table//tr[position() mod 2 = 1]')
        assert len(rows), 'No councillors found'
        for row in rows:
            text = row.xpath('.//strong/text()')[0]
            # 'Deputy Warden' must be tried first, as it contains 'Warden'.
            title = next((t for t in ('Deputy Warden', 'Warden') if t in text), None)
            if title is None:
                role = 'Councillor'
                name = text
                district = 'Lambton (seat {})'.format(next_seat)
                next_seat += 1
            else:
                role = title
                name = text.replace(title, '')
                district = 'Lambton'

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)

            p.image = row.xpath('.//img/@src')[0]
            p.add_contact('email', self.get_email(row))

            yield p
Example #42
0
    def scrape_mayor(self):
        """Return a ``Person`` for the mayor, built from the mayor's page.

        Name and photo come from the article title and body; phone, address
        and email are read from fixed paragraph positions in the article body.
        """
        page = self.lxmlize(MAYOR_PAGE, 'iso-8859-1')

        heading = page.xpath('//div[@class="articletitle"]/h1')[0]
        name = heading.text_content().replace('Mayor', '')

        p = Person(primary_org='legislature', name=name, district='Summerside', role='Mayor')
        p.add_source(MAYOR_PAGE)
        photo_src = page.xpath('//div[@class="articlebody-inside"]/p/img/@src')[0]
        p.image = photo_src.replace('..', '')

        paragraphs = page.xpath('//div[@class="articlebody-inside"]/p')
        # Phone: the text following "to " in the second paragraph.
        phone = re.findall(r'to (.*)', paragraphs[1].text_content())[0]
        # Address: fourth paragraph (minus its prefix) plus the fifth.
        address = '{} {}'.format(
            paragraphs[3].text_content().replace('by mail: ', ''),
            paragraphs[4].text_content())
        email = self.get_email(paragraphs[5])

        p.add_contact('voice', phone, 'legislature')
        p.add_contact('address', address, 'legislature')
        p.add_contact('email', email)

        return p
Example #43
0
    def scrape(self):
        """Yield a ``Person`` for each "members/" link on the council page.

        One email, read from the contact page, is attached to every person.
        Role and name are split from the first heading text of each member
        page, and the photo is the first image inside the content block.
        """
        seat = 1

        shared_email = self.get_email(self.lxmlize(CONTACT_URL))

        listing = self.lxmlize(COUNCIL_PAGE)
        member_urls = listing.xpath('//a/@href[contains(., "members/")]')
        assert len(member_urls), 'No councillors found'
        for url in member_urls:
            detail = self.lxmlize(url)
            heading = detail.xpath('//h1//text()')[0]
            role, name = heading.split(' ', 1)
            photo_url = detail.xpath('//div[@id="content"]//img/@src')[0]

            if role != 'Mayor':
                district = 'Richmond (seat {})'.format(seat)
                seat += 1
            else:
                district = 'Richmond'

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.image = photo_url
            p.add_source(COUNCIL_PAGE)
            p.add_source(CONTACT_URL)
            p.add_source(url)
            p.add_contact('email', shared_email)  # same for all
            yield p
Example #44
0
    def scrape(self):
        """Yield a ``Person`` for each non-vacant entry in the ``view-content`` list.

        A role label containing 'Ward' becomes the district of a councillor;
        'At Large' labels get numbered "St. John's (seat N)" districts; any
        other label is kept as the role for the district "St. John's".
        """
        seat = 1

        page = self.lxmlize(COUNCIL_PAGE)
        for node in page.xpath('//div[@class="view-content"]/div'):
            fields = node.xpath('./div')
            role = fields[0].xpath('./div//text()')[0]
            # The name is whatever follows the role label in the title-cased link text.
            link_text = fields[2].xpath('.//a//text()')[0]
            name = link_text.title().split(role)[-1].strip()
            if name == 'Vacant':
                continue

            if 'Ward' in role:
                district, role = role, 'Councillor'
            elif 'At Large' in role:
                role = 'Councillor at Large'
                district = "St. John's (seat {})".format(seat)
                seat += 1
            else:
                district = "St. John's"

            phone = fields[3].xpath('./div//text()')[0]
            email = self.get_email(fields[5])
            photo_url = node.xpath('.//img/@src')[0]

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('email', email)
            p.image = photo_url
            yield p
Example #45
0
    def scrape(self):
        """Yield a ``Person`` for each row of the ``Table1table`` table.

        The first text line of the second cell holds either the mayor's title
        and name or a "role Ward N" label; in the latter case the name is the
        second text line.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        rows = page.xpath('//table[@id="Table1table"]/tbody/tr')
        assert len(rows), 'No councillors found'
        # Long role labels are shortened; any other label is kept unchanged.
        short_roles = {
            'Town and Regional Councillor': 'Regional Councillor',
            'Town Councillor': 'Councillor',
        }
        for row in rows:
            lines = row.xpath('./td[2]/p/text()')
            role_district = lines[0].strip()
            if 'Mayor' not in role_district:
                name = lines[1]
                # Split just before 'Ward', so the district keeps the word.
                role, district = re.split(r' (?=Ward)', role_district)
                role = short_roles.get(role, role)
            else:
                name = role_district.replace('Mayor and Regional Councillor', '')
                role = 'Mayor'
                district = 'Milton'

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)

            p.image = row.xpath('./td[1]/p//img/@src')[0]

            # Each text line is "type: number"; the type is also used as the note.
            for entry in row.xpath('./td[3]/p[2]/text()'):
                num_type, number = entry.split(':')
                number = number.replace(', ext ', ' x').strip()
                p.add_contact(num_type, number, num_type)

            yield p
Example #46
0
    def scrape(self):
        """Yield a ``Person`` with role 'MHA' for each member path found in the listing.

        Member paths are pulled out of the raw listing text with a regular
        expression; each member page supplies the name, district, party,
        photo, email and any phone numbers found under the known headings.
        """
        self.user_agent = CUSTOM_USER_AGENT
        response = self.get(COUNCIL_PAGE)
        paths = re.findall('/Members/YourMember/[^"]+', response.text)
        assert len(paths), 'No members found'
        for path in paths:
            detail_url = 'http://www.assembly.nl.ca%s' % path
            detail = self.lxmlize(detail_url, user_agent=CUSTOM_USER_AGENT)

            name = detail.xpath('//h1/text()')[0]
            riding = detail.xpath('//h2/text()')[0]
            # A spaced non-breaking space, n-dash or hyphen becomes an m-dash.
            district = re.sub(r' [\xa0–-] ', '—', riding)
            party = PARTIES[detail.xpath('//h3/text()')[0]]

            p = Person(primary_org='legislature', name=name, district=district, role='MHA', party=party)
            p.image = detail.xpath('//img[@class="img-responsive"]/@src')[0]

            contact_block = detail.xpath('//div[@class="col-md-12"]')[0]
            p.add_contact('email', self.get_email(contact_block))

            p.add_source(COUNCIL_PAGE)
            p.add_source(detail_url)

            # One optional phone number per heading listed in HEADING_TYPE.
            for heading, _type in HEADING_TYPE.items():
                nodes = detail.xpath('//b[.="%s"]/../..' % heading)
                if not nodes:
                    continue
                phone = self.get_phone(nodes[0], error=False)
                if phone:
                    p.add_contact('voice', phone, _type)

            yield p
Example #47
0
    def scrape(self):
        """Yield the mayor, then one 'Conseiller' Person per non-vacant table row.

        Each row links to a detail page from which the name and photo are read.
        """
        page = self.lxmlize(COUNCIL_PAGE, 'utf-8')

        yield self.scrape_mayor(page)

        for row in page.xpath('//tbody/tr'):
            # Rows whose second cell starts with the text "Vacant" have no member.
            if row.xpath('./td[2]//text()')[0] == 'Vacant':
                continue

            district = row.xpath('./td[1]/text()')[0]
            if 'Conseiller n' in district:
                district = 'Greenfield Park'

            detail_url = row.xpath('./td[2]/a/@href')[0]
            detail_page = self.lxmlize(detail_url, 'utf-8')
            name = detail_page.xpath('//h1/text()')[0]

            # Prefer an image whose alt text contains the name; otherwise fall
            # back to an image carrying the "droite" class.
            photos = detail_page.xpath('//img[contains(@alt, "{0}")]/@src'.format(name))
            if not photos:
                photos = detail_page.xpath('//img[contains(@class, "droite")]/@src')
            photo_url = photos[0]

            person = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
            person.add_source(COUNCIL_PAGE)
            person.add_source(detail_url)
            person.image = photo_url
            yield person
Example #48
0
    def scrape(self):
        """Yield a Person for each ``p.WSIndent`` link on COUNCIL_PAGE.

        A link whose text contains "Ward N" is a councillor for that ward; any
        other link is treated as the mayor.
        """
        index_page = self.lxmlize(COUNCIL_PAGE)

        for link in index_page.xpath('//p[@class="WSIndent"]/a'):
            label = link.text_content()
            wards = re.findall(r'(Ward [0-9]{1,2})', label)
            if wards:
                role = 'Councillor'
                district = wards[0]
                # What remains after removing the ward is the name.
                name = label.replace(district, '').strip()
            else:
                role = 'Mayor'
                district = 'Kawartha Lakes'
                name = label.replace('Mayor', '').strip()

            url = link.attrib['href']
            detail_page = self.lxmlize(url)
            email = self.get_email(detail_page)
            image = detail_page.xpath('//img[@class="image-right"]/@src')[0]

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)
            person.add_source(url)
            person.add_contact('email', email)
            person.image = image
            yield person
Example #49
0
    def scrape(self):
        """Yield the mayor, then a 'Conseiller' Person for each non-vacant seat.

        The districts, names and profile links are read from JavaScript array
        assignments embedded in a script on COUNCIL_PAGE; the three lists are
        walked in lockstep and their first entry is the mayor.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        # it's all javascript rendered on the client... wow.
        js = page.xpath('string(//div[@class="inner_container"]/div/script[2])')  # allow string()
        districts = re.findall(r'arrayDistricts\[a.+"(.+)"', js)
        names = re.findall(r'arrayMembres\[a.+"(.+)"', js)
        urls = re.findall(r'arrayLiens\[a.+"(.+)"', js)
        # first item in list is mayor
        p = Person(primary_org='legislature', name=names[0], district='Gatineau', role='Maire')
        p.add_source(COUNCIL_PAGE)
        p.add_source(MAYOR_CONTACT_PAGE)
        email = '*****@*****.**'  # hardcoded
        p.add_contact('email', email)
        yield p

        # Skip the first (mayor) entry; zip stops at the shortest list.
        for raw_district, name, url in list(zip(districts, names, urls))[1:]:
            if name == 'Vacant':
                continue

            # Only the last path segment of the link is used, under COUNCIL_PAGE.
            profile_url = COUNCIL_PAGE + '/' + url.split('/')[-1]
            profile_page = self.lxmlize(profile_url)
            photo_url = profile_page.xpath('//img/@src')[0]
            # Raw string: '\d' in a plain literal is an invalid escape sequence.
            district = 'District ' + re.search(r'\d+', raw_district).group(0)
            email = self.get_email(profile_page)
            p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.add_source(profile_url)
            p.image = photo_url
            p.add_contact('email', email)
            yield p
Example #50
0
    def scrape(self):
        """Yield a Person for each link under "Meet Your Council" on COUNCIL_PAGE.

        Each linked page's ``printArea`` div supplies the name, role, phone,
        email and image. Regional councillors share the district 'Whitby' and
        are told apart by a running seat number.
        """
        regional_councillor_seat_number = 1
        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//a[@title="Mayor and Council::Meet Your Council"]/following-sibling::ul//@href')
        assert len(councillors), 'No councillors found'
        for councillor in councillors:
            node = self.lxmlize(councillor).xpath('//div[@id="printArea"]')[0]
            name = node.xpath('.//h1/text()')[0]

            if 'Mayor' in name:
                role = 'Mayor'
                district = 'Whitby'
                name = name.replace('Mayor ', '')
            else:
                role = node.xpath('.//h2/text()')[0]
                if 'Regional Councillor' in role:
                    # Seats are numbered in the order the pages are visited.
                    district = 'Whitby (seat {})'.format(regional_councillor_seat_number)
                    regional_councillor_seat_number += 1
                else:
                    role, district = role.split(', ')
                    # Drop any parenthesised suffix after the district name.
                    district = district.split(' (')[0]

            image = node.xpath('.//img/@src')[0]

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            # All of the details above are read from the councillor's own page,
            # so record that page as a source as well.
            p.add_source(councillor)
            p.add_contact('voice', self.get_phone(node), 'legislature')
            p.add_contact('email', self.get_email(node))
            p.image = image

            yield p
Example #51
0
    def scrape(self):
        """Yield the mayor, then a 'Councillor' Person for each linked profile.

        Profiles are linked from paragraphs containing "-" inside the
        ``articlebody-inside`` div of COUNCIL_PAGE. The second paragraph of each
        profile is parsed as a run of ``Label: value`` contact fields.
        """
        index_page = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1')

        yield self.scrape_mayor()

        for entry in index_page.xpath('//div[@class="articlebody-inside"]//p[contains(text(),"-")]'):
            link = entry.xpath('.//a')[0]
            url = link.attrib['href'].replace('../', '')
            detail = self.lxmlize(url, 'iso-8859-1')

            heading = detail.xpath('//div[@class="articletitle"]/h1')[0].text_content()
            name = heading.replace('Councillor', '').replace('Deputy Mayor', '')
            paragraphs = detail.xpath('//div[@class="articlebody-inside"]/p')
            # Keep only the digits of the first paragraph as the ward number.
            ward_digits = re.sub(r'\D+', '', paragraphs[0].text_content())
            district = 'Ward {}'.format(ward_digits)

            person = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            person.add_source(COUNCIL_PAGE)
            person.add_source(url)

            photo_url_rel = detail.xpath('//div[@class="articlebody-inside"]/p/img/@src')[0].replace('/..', '')
            person.image = urljoin(url, photo_url_rel)

            contact_text = paragraphs[1].text_content().replace('Biography', '').replace('Committees', '')
            fields = contact_text.split(':')
            # Each chunk after a colon is a value; its label is the first
            # capitalised word of the chunk before it.
            for previous, value in zip(fields, fields[1:]):
                if not value:
                    continue
                label = re.findall(r'([A-Z][a-z]+)', previous)[0]
                if label != 'Address':
                    # Cut the value at the first capital letter, where the
                    # next label begins.
                    value = re.split(r'[A-Z]', value)[0]
                contact_type = CONTACT_DETAIL_TYPE_MAP[label]
                note = '' if contact_type == 'email' else 'legislature'
                person.add_contact(contact_type, value, note)
            yield person
Example #52
0
    def scrape_mayor(self, div):
        """Build and return the 'Maire' Person from the given listing cell.

        ``div`` must contain a link to the mayor's page, which in turn links
        (title "Joindre le maire") to a contact page holding the address,
        phone and fax lines.
        """
        def dashed(line):
            # Take the text after the first colon and join its parts with dashes.
            return line.split(':')[1].strip().replace(' ', '-')

        mayor_name = div.xpath('.//a')[0].text_content()
        profile_url = div.xpath('.//a/@href')[0]
        profile_page = self.lxmlize(profile_url)
        contact_url = profile_page.xpath('//a[@title="Joindre le maire"]/@href')[0]
        contact_page = self.lxmlize(contact_url)

        mayor = Person(primary_org='legislature', name=mayor_name, district='Saint-Jean-sur-Richelieu', role='Maire')
        for source in (COUNCIL_PAGE, profile_url, contact_url):
            mayor.add_source(source)

        # The last image found in the cells preceding ``div``.
        mayor.image = div.xpath('./preceding-sibling::td//img/@src')[-1]

        lines = contact_page.xpath(
            '//div[@id="ctl00_PlaceHolderMain_ctl01_ctl01__ControlWrapper_RichHtmlField"]//div/font/text()'
        )
        # First four lines form the address; phone and fax are the third- and
        # second-to-last lines.
        address = ' '.join(lines[:4])
        phone = dashed(lines[-3])
        fax = dashed(lines[-2])
        mayor.add_contact('address', address, 'legislature')
        mayor.add_contact('voice', phone, 'legislature')
        mayor.add_contact('fax', fax, 'legislature')
        # mayor's email is a form
        return mayor
Example #53
0
    def scrape(self):
        """Yield an 'MLA' Person for each non-vacant row of the ``MLAs`` table.

        The first table row is skipped. Each remaining row links to a detail
        page that supplies the image, optional website, address, two phone
        numbers and email.
        """
        index_page = self.lxmlize(COUNCIL_PAGE)

        for row in index_page.xpath('//table[@id="MLAs"]//tr')[1:]:
            cells = row.xpath('./td')
            if 'Vacant' in cells[0].text_content():
                continue

            # Keep what follows the first ". " in the cell text as the name.
            name = cells[0].text_content().split('. ', 1)[1]
            party = cells[1].text
            district = cells[2].text_content()
            url = row.xpath('./td[1]/a/@href')[0]
            detail = self.lxmlize(url)

            member = Person(primary_org='legislature', name=name, district=district, role='MLA', party=party)
            member.add_source(COUNCIL_PAGE)
            member.add_source(url)
            member.image = detail.xpath('//div[contains(@class, "mla-image-cell")]/img/@src')[0]

            contact = detail.xpath('//div[@id="mla-contact"]/div[2]')[0]
            website = contact.xpath('./div[3]/div[3]/div[2]/a')
            if website:
                member.add_link(website[0].text_content())

            address_parts = contact.xpath('.//div[@class="col-md-4"][2]/div//text()')[1:9]
            member.add_contact('address', ' '.join(address_parts), 'constituency')

            phone_leg = contact.xpath('.//span[@id="MainContent_ContentBottom_Property6"]//text()')[0]
            phone_const = contact.xpath('.//div[@class="col-md-4"]/div[4]/span/span/text()')[0]
            member.add_contact('voice', phone_leg, 'legislature', area_code=306)
            member.add_contact('voice', phone_const, 'constituency', area_code=306)

            member.add_contact('email', self.get_email(contact))

            yield member
Example #54
0
    def scrape(self):
        """Yield a Person for each non-empty cell of the ``PageContent`` table.

        The first cell is treated as the mayor; every other cell takes its
        district from the text after "- " in the cell's ``h2``.
        """
        page = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1')

        councillors = page.xpath('//div[@id="PageContent"]/table/tbody/tr/td')
        assert len(councillors), 'No councillors found'
        for councillor in councillors:
            # Skip cells with no visible text.
            if not councillor.text_content().strip():
                continue

            if councillor == councillors[0]:
                role = 'Maire'
                district = 'Kirkland'
            else:
                role = 'Conseiller'
                heading = councillor.xpath('.//h2')[0].text_content()
                district = re.search('- (.+)', heading).group(1).strip()
                # Lower-case the compass suffixes.
                district = district.replace(' Ouest', ' ouest')
                district = district.replace(' Est', ' est')

            name = councillor.xpath('.//strong/text()')[0]

            raw_phone = councillor.xpath('.//div[contains(text(), "#")]/text()')[0]
            # Drop the "T " prefix, dash-separate the digits and turn the
            # ", # " extension marker into " x".
            phone = raw_phone.replace('T ', '').replace(' ', '-').replace(',-#-', ' x')
            email = self.get_email(councillor)

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)
            person.add_contact('voice', phone, 'legislature')
            person.add_contact('email', email)
            person.image = councillor.xpath('.//img/@src')[0]
            yield person
Example #55
0
    def scrape(self):
        """Yield a Person for each ``views-row`` in the ``view-people`` listing.

        The listing row supplies the name, role/district and image; the page it
        links to supplies the email and phone number.
        """
        listing = self.lxmlize(COUNCIL_PAGE)

        rows = listing.xpath('//div[contains(@class, "view-people")]//div[contains(@class, "views-row")]')
        assert len(rows), 'No councillors found'
        for row in rows:
            name = row.xpath('.//div[@property="dc:title"]')[0].text_content()
            subtitle_paragraphs = row.xpath('.//div[contains(@class, "field-name-field-sub-title")]//p')
            # The second-to-last subtitle paragraph holds the role and district;
            # non-breaking spaces are normalised to plain spaces.
            role_and_district = subtitle_paragraphs[-2].text_content().replace('\xa0', ' ')

            if role_and_district != 'Mayor':
                role = 'Councillor'
                district = role_and_district.split(', ', 1)[1]
            else:
                role = 'Mayor'
                district = 'Fredericton'

            url = row.xpath('.//@href')[0]
            detail_page = self.lxmlize(url)

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.image = row.xpath('.//img[@typeof="foaf:Image"]/@src')[0]
            person.add_contact('email', self.get_email(detail_page))
            phone = self.get_phone(detail_page, area_codes=[506])
            person.add_contact('voice', phone, 'legislature')
            person.add_source(COUNCIL_PAGE)
            person.add_source(url)
            yield person
Example #56
0
    def scrape(self):
        """Yield a Person for each ``block text`` div on COUNCIL_PAGE.

        A block whose ``h2`` contains "Maire" is the mayor; any other block is
        a councillor whose district number is the first number in the ``h2``.
        """
        blocks = self.lxmlize(COUNCIL_PAGE).xpath('//div[@class="block text"]')
        assert len(blocks), 'No councillors found'
        for block in blocks:
            name = block.xpath('.//div[@class="content-writable"]//strong/text()')[0]
            heading = block.xpath('.//h2/text()')[0]

            if 'Maire' in heading:
                role, district = 'Maire', 'Sainte-Anne-de-Bellevue'
            else:
                role = 'Conseiller'
                district = 'District {}'.format(re.search(r'\d+', heading)[0])

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)

            # First src attribute found anywhere inside the block.
            person.image = block.xpath('.//@src')[0]
            person.add_contact('email', self.get_email(block))
            yield person
Example #57
0
    def councillor_data(self, url, name, ward):
        """Yield a 'Councillor' Person built from the page at ``url``.

        ``name`` and ``ward`` are used as given; the page supplies the photo
        and, when one can be found, a phone number.
        """
        page = self.lxmlize(url)
        # sadly, email is a form on a separate page
        photo_sources = page.xpath('//div[contains(@id, "contentcontainer")]//img/@src')
        photo_url = urljoin(url, photo_sources[0])

        m = Person(primary_org='legislature', name=name, district=ward, role='Councillor')
        m.add_source(COUNCIL_PAGE)
        m.add_source(url)

        # Look in the main container first and only consult the lower
        # container when the first yields nothing.
        phone = self.get_phone(page.xpath('//div[@id="contentcontainer"]')[0], area_codes=[306], error=False)
        if not phone:
            phone = self.get_phone(page.xpath('//div[@id="lowercontentcontainer"]')[0], area_codes=[306], error=False)
        if phone:
            m.add_contact('voice', phone, 'legislature')

        m.image = photo_url
        yield m