Exemple #1
0
    def scrape(self):
        """Yield the mayor (district 'Saguenay'), then one Person per councillor.

        The mayor's name is read from MAYOR_PAGE and the mayor's phone and
        email from CONTACT_PAGE; councillors are read from COUNCIL_PAGE.
        """
        mayor_doc = self.lxmlize(MAYOR_PAGE)
        contact_doc = self.lxmlize(CONTACT_PAGE)

        # Keep only what precedes the first ', ' in the text node mentioning "maire".
        mayor_text = mayor_doc.xpath('//span/text()[contains(., "maire")]')[0]
        mayor_name = mayor_text.split(', ', 1)[0]
        mayor = Person(primary_org='legislature', name=mayor_name, district='Saguenay', role='Maire')
        for source_url in (MAYOR_PAGE, CONTACT_PAGE):
            mayor.add_source(source_url)
        # The second paragraph following the heading carries the contact details.
        office_paragraphs = contact_doc.xpath(
            '//h2[contains(., "Coordonnées du cabinet")]/following-sibling::p')
        office = office_paragraphs[1]
        mayor.add_contact('voice', self.get_phone(office, area_codes=[418]), 'legislature')
        mayor.add_contact('email', self.get_email(office))
        yield mayor

        # The council page is only fetched once the mayor has been yielded.
        council_doc = self.lxmlize(COUNCIL_PAGE)
        boxes = council_doc.xpath('//div[contains(./h3, "District")]')
        assert len(boxes), 'No councillors found'
        for box in boxes:
            heading = box.xpath('./h3/text()')[0]
            district = heading.replace('#', '')
            member_name = box.xpath('.//p/text()')[0]

            member = Person(primary_org='legislature', name=member_name, district=district,
                            role='Conseiller')
            member.add_source(COUNCIL_PAGE)
            member.add_contact('voice', self.get_phone(box), 'legislature')
            member.add_contact('email', self.get_email(box))
            yield member
Exemple #2
0
    def scrape(self):
        """Yield one Person per councillor box, then the mayor.

        Councillors are the ``member-box--gray`` boxes of COUNCIL_PAGE; the
        mayor is the first ``member-box--main`` box and is given the district
        'Terrebonne'. An email contact is only added when the box contains a
        ``mailto:`` link.

        Raises:
            AssertionError: if no councillor box is found.
            IndexError: if an expected element (name, phone, image, mayor box)
                is missing from the page.
        """
        page = self.lxmlize(COUNCIL_PAGE, 'utf-8')
        councillors = page.xpath('//div[@class="member-box member-box--gray"]')
        assert len(councillors), 'No councillors found'
        for councillor_elem in councillors:
            name = councillor_elem.xpath('.//div[@class="fiche__name"]/text()')[0]
            district = councillor_elem.xpath('.//div[@class="fiche__category"]/text()')[0]
            # The phone is whatever follows the first 'T' in the span text.
            phone = councillor_elem.xpath('.//div[@class="fiche__social"]/span/text()')[0].split('T')[1]
            email_mailto = councillor_elem.xpath('.//div[@class="fiche__social"]/a[contains(@href, "mailto")]/@href')
            photo_url = councillor_elem.xpath('.//img')[0].attrib['src']

            p = Person(primary_org='legislature', name=name, district=district, role='Conseiller',
                       image=photo_url)
            p.add_source(COUNCIL_PAGE)
            p.add_contact('voice', phone, 'legislature')
            if email_mailto:
                email = email_mailto[0].split('mailto:')[1]
                p.add_contact('email', email)
            yield p

        mayor_elem = page.xpath('//div[@class="member-box member-box--main"]')[0]
        name = mayor_elem.xpath('.//div[@class="fiche__name"]/text()')[0]
        phone = mayor_elem.xpath('.//div[@class="fiche__social"]/span/text()')[0].split('T')[1]
        email_mailto = mayor_elem.xpath('.//div[@class="fiche__social"]/a[contains(@href, "mailto")]/@href')
        # Read the photo from the mayor's own box; the previous code reused the
        # loop variable and therefore attached the last councillor's picture.
        photo_url = mayor_elem.xpath('.//img')[0].attrib['src']
        p = Person(primary_org='legislature', name=name, district='Terrebonne', role='Maire',
                   image=photo_url)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', phone, 'legislature')
        if email_mailto:
            email = email_mailto[0].split('mailto:')[1]
            p.add_contact('email', email)
        yield p
Exemple #3
0
    def scrape(self):
        """Yield the mayor (district 'Saguenay'), then each district councillor.

        Everything is read from COUNCIL_PAGE. The mayor only gets a phone
        contact; councillors get a phone and an email contact.
        """
        doc = self.lxmlize(COUNCIL_PAGE)

        mayor_lines = doc.xpath('//div[./div/h3[contains(text(), "Maire")]]/p/text()')
        # First line: the name follows the first '.'; second line: the phone follows the first ':'.
        mayor_name = mayor_lines[0].strip().split('.')[1].strip()
        mayor_phone = mayor_lines[1].strip().split(':')[1].strip()

        mayor = Person(primary_org='legislature', name=mayor_name, district='Saguenay', role='Maire')
        mayor.add_source(COUNCIL_PAGE)
        mayor.add_contact('voice', mayor_phone, 'legislature')
        yield mayor

        for box in doc.xpath('//div[./div/h3[contains(text(), "District")]]'):
            district = box.xpath('./div/h3')[0].text_content().replace('#', '')

            lines = box.xpath('.//p/text()')
            # Round-trip through latin-1 bytes and decode them as UTF-8.
            decoded = lines[0].encode('latin-1').decode('utf-8')
            name = decoded.replace('M. ', '').replace('Mme ', '').strip()
            phone = lines[1].split(':')[1].strip().replace(' ', '-')
            email = self.get_email(box)

            member = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
            member.add_source(COUNCIL_PAGE)
            member.add_contact('voice', phone, 'legislature')
            member.add_contact('email', email)
            yield member
Exemple #4
0
    def scrape(self):
        """Yield one Person per listed councillor, then the mayor (district 'Halifax').

        Entries whose name is 'To be determined' are skipped. Each remaining
        councillor's own page is fetched for the phone and email; the mayor is
        read from MAYOR_PAGE.
        """
        listing = self.lxmlize(COUNCIL_PAGE)
        # The first <li> of the district index block is dropped.
        entries = listing.xpath(
            '//div[@id = "block-districtdistrictindex"]/ul/li')[1:]

        assert len(entries), 'No councillors found'
        for entry in entries:
            photo_div = entry.xpath('./a/div[1]')[0]
            info_div = entry.xpath('./a/div[2]')[0]
            # Join the paragraph texts and normalise every dash/hyphen separator to an em dash.
            joined = '—'.join(info_div.xpath('./p/text()'))
            district = re.sub(r'\s*[–—-]\s*', '—', joined)
            # FIXME: we special-case one malformed district name. If you're editing this file,
            # try removing these lines
            prefix = "District 16 "
            if district.startswith(prefix):
                district = district[len(prefix):]

            raw_name = info_div.xpath('./strong/p/text()')[0]
            name = raw_name.replace('Councillor ', '').replace('Deputy Mayor ', '')

            if name == 'To be determined':
                continue

            photo = photo_div.xpath('.//img/@src')[0]
            url = entry.xpath('./a/@href')[0]
            profile = self.lxmlize(url)

            contact_node = profile.xpath(
                '//div[@id = "block-districtdistrictprofile"]')[0]
            phone = self.get_phone(contact_node, area_codes=[902])
            email = self.get_email(contact_node)

            member = Person(primary_org='legislature', name=name,
                            district=district, role='Councillor')
            member.add_source(COUNCIL_PAGE)
            member.add_source(url)
            member.add_contact('voice', phone, 'legislature')
            member.add_contact('email', email)
            member.image = photo
            yield member

        mayor_doc = self.lxmlize(MAYOR_PAGE, 'iso-8859-1')
        heading = ' '.join(mayor_doc.xpath('//h1/text()'))
        mayor_name = heading.replace('Mayor', '').strip()
        sidebar = mayor_doc.xpath(
            '//aside[contains(@class, "layout-sidebar-second")]/section/div[1]'
        )[0]
        # Phone and email are both looked up in the second paragraph of the sidebar block.
        mayor_phone = self.get_phone(sidebar.xpath('./p[2]')[0])
        mayor_email = self.get_email(sidebar.xpath('./p[2]')[0])

        mayor = Person(primary_org='legislature', name=mayor_name,
                       district='Halifax', role='Mayor')
        mayor.add_source(MAYOR_PAGE)
        mayor.add_contact('email', mayor_email)
        mayor.add_contact('voice', mayor_phone, 'legislature')
        yield mayor
Exemple #5
0
    def scrape(self):
        """Yield one Person per listed councillor, then the mayor (district 'Halifax').

        Entries named 'To be determined' are skipped. For every other entry the
        councillor page is fetched, then its contact page, from which the phone
        and email are read. The mayor only gets an email contact.
        """
        listing = self.lxmlize(COUNCIL_PAGE)

        for box in listing.xpath('//div[./h2/a[contains(@href, "/District")]]'):
            # Drop commas, trim, discard empty pieces, join with an em dash and
            # normalise any remaining dash/hyphen separator to an em dash.
            pieces = [
                piece
                for piece in (raw.replace(',', '').strip()
                              for raw in box.xpath('./p/text()'))
                if piece
            ]
            district = re.sub(r' ?[–—-] ?', '—', '—'.join(pieces))

            label = box.xpath('./p/strong/text()')[0]
            name = label.strip()[len('Councillor '):] if 'Councillor' in label else label

            if name == 'To be determined':
                continue

            photo = box.xpath('./p/a/img/@src')[0]

            profile_url = box.xpath('./h2/a/@href')[0]
            profile = self.lxmlize(profile_url)
            contact_page_url = profile.xpath(
                '//li/a[contains(@href, "contact")]/@href')[0]
            contact_doc = self.lxmlize(contact_page_url)
            contact_node = contact_doc.xpath(
                '//div[./h1[contains(text(), "Contact")]]')[0]

            phone = self.get_phone(contact_node, area_codes=[902])
            email = self.get_email(contact_node)

            member = Person(primary_org='legislature', name=name,
                            district=district, role='Councillor')
            member.add_source(COUNCIL_PAGE)
            member.add_source(contact_page_url)
            member.add_contact('voice', phone, 'legislature')
            member.add_contact('email', email)
            member.image = photo
            yield member

        mayor_doc = self.lxmlize(MAYOR_PAGE, 'iso-8859-1')
        bio_heading = ' '.join(mayor_doc.xpath('//h2[contains(., "Bio")]/text()'))
        # Cut the trailing ' Bio' (four characters) off the stripped heading.
        mayor_name = bio_heading.strip()[:-len(' Bio')]
        mayor_contact_doc = self.lxmlize(MAYOR_CONTACT_URL, 'iso-8859-1')
        mayor_email = self.get_email(mayor_contact_doc)

        mayor = Person(primary_org='legislature', name=mayor_name,
                       district='Halifax', role='Mayor')
        mayor.add_source(MAYOR_PAGE)
        mayor.add_source(MAYOR_CONTACT_URL)
        mayor.add_contact('email', mayor_email)
        yield mayor
Exemple #6
0
    def scrape(self):
        """Yield the mayor (district 'Caledon'), then the councillors of the second table.

        For every person the linked "Visit" page is fetched to obtain the email.
        """
        doc = self.lxmlize(COUNCIL_PAGE)

        mayor_cell = doc.xpath('//td[@rowspan="2"]')[0]
        mayor_name = mayor_cell.xpath('.//h3/strong/text()')[0]
        mayor_image = mayor_cell.xpath('.//@src')[0]
        mayor_phone = self.get_phone(mayor_cell)
        mayor_url = mayor_cell.xpath('.//a[contains(., "Visit")]/@href')[0]

        mayor = Person(primary_org='legislature', name=mayor_name,
                       district='Caledon', role='Mayor')
        mayor.add_source(COUNCIL_PAGE)
        mayor.add_source(mayor_url)
        mayor.add_contact('voice', mayor_phone, 'legislature')
        mayor.add_contact('email', self.get_email(self.lxmlize(mayor_url)))
        mayor.image = mayor_image
        yield mayor

        all_cells = doc.xpath('//div[@id="printAreaContent"]//table[2]//td')
        # Cells 12 to 15 are left out of the grid.
        cells = all_cells[:12] + all_cells[16:]
        assert len(cells), 'No councillors found'
        for seq in range(len(cells) // 3):
            # Each person uses three cells: heading at `head`, photo at
            # `head + 4`, details at `head + 8`; every fourth person jumps
            # ahead by twelve cells.
            head = seq // 4 * 12 + seq % 4
            district, role = cells[head].xpath('.//h3/text()')
            detail_cell = cells[head + 8]
            name = detail_cell.xpath('.//strong/text()')[0]
            voice = self.get_phone(detail_cell)
            url = detail_cell.xpath('.//a[contains(., "Visit")]/@href')[0]

            photo_cell = cells[head + 4]
            image = (None if 'photo to come' in photo_cell.text_content()
                     else photo_cell.xpath('.//@src')[0])

            # Replace non-breaking spaces, and pluralise 'Ward' for combined wards.
            district = district.replace('\xa0', ' ')
            if ' and ' in district:
                district = district.replace('Ward ', 'Wards ')

            member = Person(primary_org='legislature', name=name,
                            district=district, role=role)
            member.add_source(COUNCIL_PAGE)
            member.add_source(url)
            member.add_contact('voice', voice, 'legislature')
            member.add_contact('email', self.get_email(self.lxmlize(url)))
            if image:
                member.image = image

            yield member
Exemple #7
0
    def scrape(self):
        """Yield the mayor (district 'Charlottetown'), then the councillors.

        People are the ``span.Title`` elements of COUNCIL_PAGE: the first one
        is the mayor, the rest are councillors. Councillor spans whose text
        does not split into exactly two parts on a dash are skipped.
        """
        doc = self.lxmlize(COUNCIL_PAGE)
        titles = doc.xpath('//span[@class="Title"]')

        mayor_span = titles[0]
        mayor_text = ' '.join(mayor_span.xpath('.//text()'))
        # The name is everything before the first '('.
        mayor_name = re.search(r'[^(]+', mayor_text).group(0).strip()
        mayor_photo = urljoin(COUNCIL_PAGE, mayor_span.xpath('img/@src')[0])
        mayor_email = mayor_span.xpath('following::a[1]/text()')[0]

        mayor = Person(primary_org='legislature', name=mayor_name,
                       district='Charlottetown', role='Mayor')
        mayor.add_source(COUNCIL_PAGE)
        mayor.add_contact('email', mayor_email)
        mayor.image = mayor_photo
        yield mayor

        council_spans = doc.xpath('//span[@class="Title"]')[1:]
        assert len(council_spans), 'No councillors found'
        for title in council_spans:
            text = ' '.join(title.xpath('.//text()'))
            # Treat U+2013 and the \x96 character as plain hyphens before splitting.
            normalised = text.replace('\u2013', '-').replace('\x96', '-')
            parts = normalised.split('-')
            if len(parts) != 2:
                continue
            name_part, district_part = parts

            # Remove the title and any parenthesised note, then collapse whitespace.
            name = name_part.strip().replace('Councillor', '')
            name = re.sub(r'\(.+?\)', '', name)
            name = ' '.join(name.split())

            # The district id is the first two words after the dash.
            district_id = ' '.join(district_part.split()[:2])

            # needed a wacky xpath to deal with ward 8
            photos = title.xpath('preceding::hr[1]/following::img[1]/@src')
            photo_url = urljoin(COUNCIL_PAGE, photos[0])

            # string() yields '' when there is no match, so the email can be empty.
            email = title.xpath('string(following::a[1]/text())')

            member = Person(primary_org='legislature', name=name,
                            district=district_id, role='Councillor')
            member.add_source(COUNCIL_PAGE)
            if email:
                member.add_contact('email', email)
            member.image = photo_url

            yield member
Exemple #8
0
    def scrape(self):
        """Yield a Person for every ``h1.title`` heading of the form "name, district".

        Headings without a comma are ignored. A heading whose second part
        contains 'Mayor' yields the mayor (district 'Beaconsfield'); any other
        heading yields a councillor whose district is the text after the first
        '-' of the second part.
        """
        doc = self.lxmlize(COUNCIL_PAGE)

        for heading in doc.xpath('//h1[@class="title"]'):
            title = heading.text_content()
            if ',' not in title:
                continue
            raw_name, raw_district = title.split(',')
            name = raw_name.strip()
            is_mayor = 'Mayor' in raw_district

            if is_mayor:
                member = Person(primary_org='legislature', name=name,
                                district='Beaconsfield', role='Maire')
            else:
                member = Person(primary_org='legislature', name=name,
                                district=raw_district.split('-')[1].strip(),
                                role='Conseiller')
            member.add_source(COUNCIL_PAGE)
            member.image = heading.xpath(
                './parent::div/parent::div/p//img/@src')[0]

            if is_mayor:
                # The mayor's details sit in sibling <div>s and the phone is mandatory.
                phone_text = heading.xpath(
                    './/parent::div/following-sibling::div[contains(text(), "514")]/text()'
                )[0]
                member.add_contact(
                    'voice',
                    phone_text.split(':')[1].strip().replace(' ', '-'),
                    'legislature')
                script_node = heading.xpath(
                    './/parent::div/following-sibling::div/script')[0]
            else:
                # Councillor details sit in sibling <p>s and the phone is optional.
                phone_texts = heading.xpath(
                    './/parent::div/following-sibling::p[contains(text(), "514")]/text()'
                )
                if phone_texts:
                    member.add_contact(
                        'voice',
                        phone_texts[0].split(':')[1].strip().replace(' ', '-'),
                        'legislature')
                script_node = heading.xpath(
                    './/parent::div/following-sibling::p/script')[0]

            # The email is extracted from the script text by get_email.
            member.add_contact('email', get_email(script_node.text_content()))
            yield member
Exemple #9
0
    def scrape(self):
        """Yield the result of ``councillor_data`` for each councillor, then the mayor.

        The first ``cocis-has-caption`` block is skipped. The mayor (district
        'Calgary') gets a phone read from MAYOR_PAGE but no email.
        """
        doc = self.lxmlize(COUNCIL_PAGE)
        for block in doc.xpath('//div[contains(@class,"cocis-has-caption")]')[1:]:
            href = block.xpath('.//a[1]/@href')[0]
            url = urljoin(COUNCIL_PAGE, href)
            name = block.xpath('.//a//text()')[0]
            # The ward is the caption text without its last word.
            caption_words = block.xpath('.//strong//text()')[0].split()
            ward = ' '.join(caption_words[:-1])
            yield self.councillor_data(url, name, ward)

        mayor_block = doc.xpath(
            '//div[contains(@class, "cocis-image-panel")]')[0]
        mayor_photo = urljoin(COUNCIL_PAGE, mayor_block.xpath('.//img/@src')[0])
        mayor_name = mayor_block.xpath('.//a//text()')[0]
        mayor_doc = self.lxmlize(MAYOR_PAGE)
        # Email behind mailhide, so it is not scraped.
        mayor_phone = self.get_phone(mayor_doc, area_codes=[403])

        mayor = Person(primary_org='legislature', name=mayor_name,
                       district='Calgary', role='Mayor')
        mayor.add_source(COUNCIL_PAGE)
        mayor.add_source(MAYOR_PAGE)
        mayor.add_contact('voice', mayor_phone, 'legislature')
        mayor.image = mayor_photo
        yield mayor
Exemple #10
0
    def scrape(self):
        """Yield one Person per odd-positioned table row of COUNCIL_PAGE.

        The Warden and Deputy Warden get the district 'Lambton'; every other
        member gets a numbered 'Lambton (seat N)' district, counted in page
        order starting at 1.
        """
        next_seat = 1

        doc = self.lxmlize(COUNCIL_PAGE)
        rows = doc.xpath(
            '//div[@id="content"]//table//tr[position() mod 2 = 1]')
        assert len(rows), 'No councillors found'
        for row in rows:
            label = row.xpath('.//strong/text()')[0]
            # 'Deputy Warden' is tested first because it also contains 'Warden'.
            if 'Deputy Warden' in label:
                role, district = 'Deputy Warden', 'Lambton'
                name = label.replace('Deputy Warden', '')
            elif 'Warden' in label:
                role, district = 'Warden', 'Lambton'
                name = label.replace('Warden', '')
            else:
                role, name = 'Councillor', label
                district = 'Lambton (seat {})'.format(next_seat)
                next_seat += 1

            member = Person(primary_org='legislature', name=name,
                            district=district, role=role)
            member.add_source(COUNCIL_PAGE)
            member.image = row.xpath('.//img/@src')[0]
            member.add_contact('email', self.get_email(row))

            yield member
Exemple #11
0
 def scrape(self):
     """Yield one Person (role 'MLA') per named row of the downloaded CSV.

     Rows whose first, middle and last name are all blank are skipped. The
     photo is read from a per-riding detail page built from 'Riding Number'.
     """
     response = self.get(self.get_csv_url())
     rows = csv.DictReader(StringIO(response.text))
     for row in rows:
         full_name = '{} {} {}'.format(row['MLA First Name'],
                                       row['MLA Middle Names'],
                                       row['MLA Last Name'])
         if not full_name.strip():
             continue
         party = get_party(row['Caucus'])
         # Anything after the first comma in the name is dropped.
         display_name = full_name.split(',')[0]
         detail_url = ('http://www.assembly.ab.ca/net/index.aspx?'
                       'p=mla_contact&rnumber={0}&leg=29'.format(
                           row['Riding Number']))
         detail_doc = self.lxmlize(detail_url)
         photo_url = detail_doc.xpath('//img[@class="MemPhoto"]/@src')[0]
         member = Person(
             primary_org='legislature',
             name=display_name,
             district=row['Riding Name'],
             role='MLA',
             party=party,
             image=photo_url,
         )
         member.add_source(COUNCIL_PAGE)
         member.add_source(detail_url)
         # Prefer the 'Email' column; fall back to 'MLA Email' when it is empty.
         email = row['Email'] or row.get('MLA Email')
         if email:
             member.add_contact('email', email)
         phone = row['Phone Number']
         if phone:
             member.add_contact('voice', phone, 'legislature')
         yield member
Exemple #12
0
    def scrape(self):
        """Yield one Person per row of the 800-wide table of COUNCIL_PAGE.

        The first row is the mayor (district 'Mercier'); every other row is a
        councillor whose district number is the first digit found in the
        row's fourth text node.
        """
        doc = self.lxmlize(COUNCIL_PAGE,
                           user_agent=CUSTOM_USER_AGENT,
                           encoding='windows-1252')

        rows = doc.xpath('//table[@width="800"]/tr')
        assert len(rows), 'No councillors found'
        for position, row in enumerate(rows):
            # Both roles share the same name clean-up: drop the honorific and trim.
            heading = row.xpath('.//strong/text()')[0]
            name = heading.replace('Monsieur', '').replace('Madame', '').strip()

            if position == 0:
                role, district = 'Maire', 'Mercier'
            else:
                role = 'Conseiller'
                digit = re.search(r'(\d)', row.xpath('.//text()')[3]).group(1)
                district = 'District {}'.format(digit)

            email = self.get_email(row)

            member = Person(primary_org='legislature', name=name,
                            district=district, role=role)
            member.add_source(COUNCIL_PAGE)
            member.add_contact('email', email)
            yield member
Exemple #13
0
    def scrape(self):
        """Yield one Person per table row of COUNCIL_PAGE.

        A row whose first-column label contains 'Maire' becomes the mayor
        (district 'Senneville'); any other row is a councillor whose district
        is that label with 'no. ' removed. The image is optional.
        """
        doc = self.lxmlize(COUNCIL_PAGE)

        rows = doc.xpath('//div[@class="field-item even"]//tr')
        assert len(rows), 'No councillors found'
        for row in rows:
            label = row.xpath('./td[1]//strong/text()')[0].replace('no. ', '')
            if 'Maire' in label:
                district, role = 'Senneville', 'Maire'
            else:
                district, role = label, 'Conseiller'

            # Names are title-cased as they appear in the second column.
            name = row.xpath('./td[2]//p//text()')[0].title()
            email = self.get_email(row)

            member = Person(primary_org='legislature', name=name,
                            district=district, role=role)
            member.add_source(COUNCIL_PAGE)
            # Not every row has a picture; set the image only when one exists.
            images = row.xpath('.//img/@src')
            if images:
                member.image = images[0]
            member.add_contact('email', email)
            yield member
Exemple #14
0
    def scrape(self):
        """Yield one Person per non-empty table cell of COUNCIL_PAGE.

        The first paragraph of a cell must hold exactly three text nodes:
        district, role and name. 'City of Oshawa' is shortened to 'Oshawa' and
        two role labels are mapped to shorter ones.
        """
        role_aliases = {
            'City Councillor': 'Councillor',
            'Regional & City Councillor': 'Regional Councillor',
        }

        doc = self.lxmlize(COUNCIL_PAGE)
        cells = doc.xpath('//table//td[*]')

        assert len(cells), 'No councillors found'
        for cell in cells:
            district, raw_role, name = cell.xpath('./p[1]/text()')
            stripped_role = raw_role.strip()
            # Roles without an alias are kept as they are.
            role = role_aliases.get(stripped_role, stripped_role)

            if district == 'City of Oshawa':
                district = 'Oshawa'

            photo_url = cell.xpath('./p/img/@src')[0]
            phone_paragraph = cell.xpath('./p[contains(.//text(), "Phone")]')[0]
            phone = self.get_phone(phone_paragraph, area_codes=[905])

            member = Person(primary_org='legislature', name=name,
                            district=district, role=role, image=photo_url)
            member.add_source(COUNCIL_PAGE)
            member.add_contact('voice', phone, 'legislature')
            member.add_contact('email', self.get_email(cell))
            yield member
Exemple #15
0
    def scrape(self):
        """Yield the result of ``scrape_mayor``, then one Person per councillor paragraph.

        Regional councillors get numbered 'Whitby (seat N)' districts in page
        order; other councillors take their role and district from the text
        after their name.
        """
        next_regional_seat = 1
        doc = self.lxmlize(COUNCIL_PAGE)

        yield self.scrape_mayor(doc)

        # The last paragraph following the "Councillors" heading is not a councillor entry.
        paragraphs = doc.xpath('//h3[contains(text(), "Councillors")]/following-sibling::p')[:-1]
        assert len(paragraphs), 'No councillors found'
        for paragraph in paragraphs:
            strong_texts = paragraph.xpath('./strong/text()')
            # Skip paragraphs with no bold text or with a text node that is exactly 'Vacant'.
            if not strong_texts or 'Vacant' in strong_texts:
                continue

            raw_name, role_district = strong_texts
            name = raw_name.rstrip(',')

            if 'Regional Councillor' in role_district:
                role = role_district
                district = 'Whitby (seat {})'.format(next_regional_seat)
                next_regional_seat += 1
            else:
                role, raw_district = role_district.strip().split(', ')
                # Drop any parenthesised suffix from the district.
                district = raw_district.split(' (')[0]

            email = self.get_email(paragraph)
            image = paragraph.xpath('./img/@src')[0]
            member = Person(primary_org='legislature', name=name, district=district,
                            role=role, image=image)
            member.add_source(COUNCIL_PAGE)
            member.add_contact('email', email)
            yield member
Exemple #16
0
    def scrape(self):
        """Yield one Person per link found under ``center/center`` on COUNCIL_PAGE.

        Each link is followed; role and district come from the linked page's
        section heading, and address, phone, fax and email from its info cell.
        Members without a ward in the heading get a numbered
        'Richmond Hill (seat N)' district.
        """
        next_regional_seat = 1

        listing = self.lxmlize(COUNCIL_PAGE)

        for link in listing.xpath('//center/center//a'):
            name = link.text_content().strip()
            url = link.attrib['href']
            detail = self.lxmlize(url)
            header = detail.xpath(
                '//div[@class="sectionheading"]')[0].text_content()
            if header == 'Mayor of Richmond Hill':
                district, role = 'Richmond Hill', 'Mayor'
            else:
                # The ward, when present, sits between a comma and a hyphen.
                wards = re.findall(r',(.*)-', header)
                if wards:
                    district = wards[0].strip()
                else:
                    district = 'Richmond Hill (seat {})'.format(
                        next_regional_seat)
                    next_regional_seat += 1

                if 'Regional' in header:
                    role = 'Regional Councillor'
                else:
                    role = 'Councillor'

            info_cells = detail.xpath(
                '//table[@cellpadding>0]/tbody/tr/td[last()]|//table[not(@cellpadding)]/tbody/tr/td[last()]'
            )
            info = info_cells[0].text_content().replace(' - office:', ':')

            raw_address = re.findall(
                r'(?<=Town of Richmond Hill)(.*(?=Telephone:)|(?=Telephone))',
                info)[0]
            # Re-insert the space lost where a lowercase letter runs into an uppercase one.
            address = re.sub(r'([a-z])([A-Z])', r'\1 \2', raw_address)
            # I expected to be able to do '(.*)(?=\sTelephone|Telephone|Fax)', but nope.
            raw_phone = re.findall(
                r'(?<=Telephone:) ((.*) (?=Telephone)|(.*)(?=Telephone)|(.*)(?=Fax))',
                info)[0][0]
            phone = (raw_phone.replace('(', '')
                     .replace(') ', '-')
                     .replace(', ext. ', ' x'))
            raw_fax = re.findall(r'(?<=Fax:) (.*)(?=E-mail)', info)[0]
            fax = raw_fax.replace(' ', '').replace('(', '').replace(')', '-')
            email = self.get_email(detail)

            member = Person(primary_org='legislature', name=name,
                            district=district, role=role)
            member.add_source(COUNCIL_PAGE)
            member.add_source(url)
            member.add_contact('address', address, 'legislature')
            member.add_contact('voice', phone, 'legislature')
            member.add_contact('fax', fax, 'legislature')
            member.add_contact('email', email)
            # The photo is the image whose alt text contains the member's name.
            member.image = detail.xpath(
                '//img[contains(@alt, "{}")]/@src'.format(name))[0]
            if 'Website' in info:
                member.add_link(re.findall(r'www\..*\.[a-z]+', info)[0])
            yield member
Exemple #17
0
    def scrape_councillor(self, url, district):
        """Build the Person for the councillor whose profile is at ``url``.

        The name is the second and third space-separated words of the second
        paragraph. The email is derived from the name rather than read from
        the page: first initial + second word + '@langleycity.ca'. Residence
        address and phone are added only when the last paragraph starts with
        a text node containing 'Residence'.

        Returns:
            The populated Person, with role 'Councillor' and the given district.
        """
        profile = self.lxmlize(url)
        body = profile.xpath('//div[@class="item-page"]')[0]

        intro_words = body.xpath('p[2]/text()')[0].split(' ')
        name = ' '.join(intro_words[1:3])
        lowered = name.lower().split(' ')
        email = lowered[0][0] + lowered[1] + '@langleycity.ca'
        photo_url = body.xpath('p[1]/img/@src')[0]

        member = Person(primary_org='legislature', name=name,
                        district=district, role='Councillor',
                        image=photo_url)
        member.add_source(COUNCIL_PAGE)
        member.add_source(url)
        member.add_contact('email', email)

        details = body.xpath('p[last()]/text()')

        if 'Residence' in details[0]:
            # The phone is matched line by line; the address against the space-joined text.
            phone_match = re.findall(r'(Phone|Res)(:?) (.*)', '\n'.join(details))[0]
            address_match = re.findall(r'Address: (.*) (Phone|Res)', ' '.join(details))[0]
            member.add_contact('address', address_match[0], 'residence')
            member.add_contact('voice', phone_match[2], 'residence')

        return member
Exemple #18
0
    def scrape(self):
        """Yield one Person per ``councillorwrapper`` block of COUNCIL_PAGE.

        The first block, when it has an empty district label, is the mayor
        (district 'Calgary') and gets a hard-coded email; every other block is
        a councillor without an email contact.
        """
        doc = self.lxmlize(COUNCIL_PAGE)

        wrappers = doc.xpath(
            '//div[contains(@class, "councillorwrapper")]')
        assert len(wrappers), 'No councillors found'
        for position, wrapper in enumerate(wrappers):
            name = wrapper.xpath('.//h4/text()')[0]
            district = wrapper.xpath('.//h4/span/text()')[0].strip()

            if position == 0 and not district:
                district, role, email = 'Calgary', 'Mayor', '*****@*****.**'
            else:
                role, email = 'Councillor', None

            member = Person(primary_org='legislature', name=name,
                            district=district, role=role)
            # First src attribute found anywhere inside the block.
            member.image = wrapper.xpath('.//@src')[0]
            if email:
                member.add_contact('email', email)
            member.add_source(COUNCIL_PAGE)
            yield member
Exemple #19
0
    def scrape(self):
        """Yield one Person per member box of COUNCIL_PAGE.

        Each member's call-to-action page is fetched to read the category: the
        category 'Maire' marks the mayor (district 'Terrebonne'); anything else
        is used as a district number for a councillor.
        """
        listing = self.lxmlize(COUNCIL_PAGE, 'utf-8')
        boxes = listing.xpath('//div[contains(@class, "member-box member-box--")]')
        assert len(boxes), 'No councillors found'
        for box in boxes:
            name = box.xpath('.//div[@class="fiche__name"]/text()')[0]
            # The phone is whatever follows the first 'T' in the span text.
            phone_text = box.xpath('.//div[@class="fiche__social"]/span/text()')[0]
            phone = phone_text.split('T')[1]
            mailto_links = box.xpath('.//div[@class="fiche__social"]/a[contains(@href, "mailto")]/@href')
            photo_url = box.xpath('.//img')[0].attrib['src']

            profile_url = box.xpath('.//a[@class="member-box__calltoaction"]/@href')[0]
            profile = self.lxmlize(profile_url)
            category = profile.xpath('.//div[@class="fiche__category"]/text()')[0]

            if category == 'Maire':
                district, role = 'Terrebonne', 'Maire'
            else:
                district, role = 'District {}'.format(category), 'Conseiller'

            member = Person(primary_org='legislature', name=name, district=district,
                            role=role, image=photo_url)
            member.add_source(COUNCIL_PAGE)
            member.add_contact('voice', phone, 'legislature')
            if mailto_links:
                member.add_contact('email', mailto_links[0].split('mailto:')[1])
            yield member
Exemple #20
0
    def scrape(self):
        """Yield the result of ``councillor_data`` for each councillor cell, then the mayor.

        Councillors are the 105-wide cells of COUNCIL_PAGE; the mayor (district
        'Winnipeg') is the first 315-wide cell and gets a hard-coded email.
        """
        doc = self.lxmlize(COUNCIL_PAGE, 'utf-8')
        cells = doc.xpath('//td[@width="105"]')
        assert len(cells), 'No councillors found'
        for cell in cells:
            url = urljoin(COUNCIL_PAGE, cell.xpath('.//a/@href')[0])
            link_text = cell.xpath('.//a//text()')[0]
            ward = re.search('([A-Z].+) Ward', link_text).group(1)
            # Spaced en dashes and spaced hyphens both become an em dash.
            ward = ward.replace(' – ', '—').replace(' - ', '—')
            # Drop the period to match ocd-division-ids.
            ward = ward.replace('St. Norbert', 'St Norbert')
            name = ' '.join(cell.xpath('.//span[@class="k80B"][1]/text()'))
            yield self.councillor_data(url, name, ward)

        mayor_cell = doc.xpath('//td[@width="315"]')[0]
        # Skip the leading 'Mayor ' (six characters) of the link text.
        mayor_name = mayor_cell.xpath('./a//text()')[0][len('Mayor '):]
        mayor_photo_url = mayor_cell.xpath('./img/@src')[0]
        mayor = Person(primary_org='legislature', name=mayor_name,
                       district='Winnipeg', role='Mayor')
        mayor.add_source(COUNCIL_PAGE)
        # @see http://www.winnipeg.ca/interhom/mayor/MayorForm.asp?Recipient=CLK-MayorWebMail
        mayor.add_contact('email', '*****@*****.**')  # hardcoded
        mayor.image = mayor_photo_url
        yield mayor
Exemple #21
0
    def scrape_mayor(self, url):
        """Build the Person for the mayor whose profile is at ``url``.

        The name is the third and fourth space-separated words of the second
        paragraph. The email is derived from the name rather than read from
        the page: first initial + second word + '@langleycity.ca'. The office
        address and phone come from the last paragraph.

        Returns:
            The populated Person, with role 'Mayor' and district 'Langley'.
        """
        profile = self.lxmlize(url)
        body = profile.xpath('//div[@class="item-page"]')[0]

        intro_words = body.xpath('p[2]/text()')[0].split(' ')
        name = ' '.join(intro_words[2:4])
        lowered = name.lower().split(' ')
        email = lowered[0][0] + lowered[1] + '@langleycity.ca'
        photo_url = body.xpath('p[1]/img/@src')[0]

        mayor = Person(primary_org='legislature', name=name,
                       district='Langley', role='Mayor',
                       image=photo_url)
        mayor.add_source(COUNCIL_PAGE)
        mayor.add_source(url)
        mayor.add_contact('email', email)

        details = body.xpath('p[last()]/text()')

        # The phone is matched line by line; the address against the space-joined text.
        phone_match = re.findall(r'Phone(:?) (.*)', '\n'.join(details))[0]
        phone = phone_match[1]
        address = re.findall(r'Address: (.*) Phone', ' '.join(details))[0]
        mayor.add_contact('address', address, 'office')
        mayor.add_contact('voice', phone, 'office')

        return mayor
Exemple #22
0
    def scrape(self):
        """Yield a Person for every row of the ``Table1table`` table.

        The second cell supplies, in order, the role title, the name and
        (when present) the district; rows without a third text node are
        assigned the district 'Milton'. Only the first row contributes an
        address. Each ``label:value`` line in the third cell's second
        paragraph is added as a contact.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        rows = page.xpath('//table[@id="Table1table"]/tbody/tr')
        assert len(rows), 'No councillors found'

        # Page titles mapped to the role names stored on the record; any
        # other title is kept unchanged.
        role_names = {
            'Mayor and Regional Councillor': 'Mayor',
            'Local & Regional Councillor': 'Regional Councillor',
            'Local Councillor': 'Councillor',
        }

        for index, row in enumerate(rows):
            lines = row.xpath('./td[2]/p/text()')
            name = lines[1]
            title = lines[0].strip()
            role = role_names.get(title, title)
            district = lines[2] if len(lines) >= 3 else 'Milton'

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)

            person.image = row.xpath('./td[1]/p//img/@src')[0]

            if index == 0:
                address_lines = row.xpath('./td[3]/p[1]/text()')
                address = ', '.join(address_lines).replace('Email:', '').strip()
                person.add_contact('address', address, 'legislature')

            for entry in row.xpath('./td[3]/p[2]/text()'):
                label, value = entry.split(':')
                value = value.replace(', ext ', ' x').strip()
                person.add_contact(label, value, label)

            yield person
Exemple #23
0
    def scrape(self):
        """Yield a Person for each member page linked from COUNCIL_PAGE.

        Every record gets the single email found on CONTACT_URL. The mayor's
        district is 'Richmond'; everyone else gets a numbered seat, counted
        in the order the links appear.
        """
        seat = 1

        shared_email = self.get_email(self.lxmlize(CONTACT_URL))

        index = self.lxmlize(COUNCIL_PAGE)
        member_urls = index.xpath('//a/@href[contains(., "members/")]')
        assert len(member_urls), 'No councillors found'
        for member_url in member_urls:
            member_page = self.lxmlize(member_url)
            # The heading reads "<role> <name>"; split on the first space.
            heading = member_page.xpath('//h1//text()')[0]
            role, name = heading.split(' ', 1)
            image = member_page.xpath('//div[@id="content"]//img/@src')[0]

            if role != 'Mayor':
                district = 'Richmond (seat {})'.format(seat)
                seat += 1
            else:
                district = 'Richmond'

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.image = image
            for source in (COUNCIL_PAGE, CONTACT_URL, member_url):
                person.add_source(source)
            person.add_contact('email', shared_email)  # same for all
            yield person
Exemple #24
0
    def scrape_mayor(self):
        """Return the mayor's Person record scraped from MAYOR_PAGE.

        The name is the article title with the word 'Mayor' removed. Phone,
        address and email are read from fixed paragraph positions inside
        the ``articlebody-inside`` div.
        """
        page = self.lxmlize(MAYOR_PAGE, 'iso-8859-1')

        heading = page.xpath('//div[@class="articletitle"]/h1')[0]
        name = heading.text_content().replace('Mayor', '')

        mayor = Person(primary_org='legislature', name=name, district='Summerside', role='Mayor')
        mayor.add_source(MAYOR_PAGE)
        image_src = page.xpath('//div[@class="articlebody-inside"]/p/img/@src')[0]
        mayor.image = image_src.replace('..', '')

        paragraphs = page.xpath('//div[@class="articlebody-inside"]/p')
        # Paragraph 1 holds the phone, 3 and 4 the mailing address, 5 the email.
        phone = re.findall(r'to (.*)', paragraphs[1].text_content())[0]
        street = paragraphs[3].text_content().replace('by mail: ', '')
        address = street + ' ' + paragraphs[4].text_content()
        email = self.get_email(paragraphs[5])

        mayor.add_contact('voice', phone, 'legislature')
        mayor.add_contact('address', address, 'legislature')
        mayor.add_contact('email', email)

        return mayor
Exemple #25
0
    def scrape_mayor(self, div):
        """Return the mayor's Person record, starting from the element *div*.

        The first link in *div* gives the name and the profile URL. The
        profile page links to a contact page ("Joindre le maire"), from
        which the address, phone and fax are read.
        """
        link = div.xpath('.//a')[0]
        name = link.text_content()
        url = div.xpath('.//a/@href')[0]
        profile = self.lxmlize(url)
        contact_url = profile.xpath('//a[@title="Joindre le maire"]/@href')[0]
        contact_page = self.lxmlize(contact_url)

        mayor = Person(primary_org='legislature', name=name, district='Saint-Jean-sur-Richelieu', role='Maire')
        for source in (COUNCIL_PAGE, url, contact_url):
            mayor.add_source(source)

        # Use the last image found in the preceding sibling cells.
        mayor.image = div.xpath('./preceding-sibling::td//img/@src')[-1]

        lines = contact_page.xpath(
            '//div[@id="ctl00_PlaceHolderMain_ctl01_ctl01__ControlWrapper_RichHtmlField"]//div/font/text()'
        )

        def dashed_number(line):
            # Keep the text after the first colon and join its parts with dashes.
            return line.split(':')[1].strip().replace(' ', '-')

        address = ' '.join(lines[:4])
        phone = dashed_number(lines[-3])
        fax = dashed_number(lines[-2])
        mayor.add_contact('address', address, 'legislature')
        mayor.add_contact('voice', phone, 'legislature')
        mayor.add_contact('fax', fax, 'legislature')
        # mayor's email is a form
        return mayor
Exemple #26
0
    def scrape(self):
        """Yield the mayor, then up to two councillors per ward heading.

        The mayor is built by ``self.scrape_mayor`` from the first paragraph
        following the "MAYOR" ``h2``. Each ``h3`` is treated as a ward: its
        text, minus a leading ``WARD <n> - `` prefix, is the district, and
        the first two following-sibling paragraphs are its councillors.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        mayor_info = page.xpath('//h2[contains(text(), "MAYOR")]//following-sibling::p')[0]
        yield self.scrape_mayor(mayor_info)

        for ward in page.xpath('//h3'):
            # Raw string: '\A' and '\d' are regex escapes, not string escapes.
            district = re.sub(r'\AWARD \d+ - ', '', ward.text_content())
            # following-sibling::p also matches paragraphs of later wards, so
            # only the first two are used. Slicing (instead of comparing with
            # councillors[1]) also works when fewer than two are present.
            for councillor in ward.xpath('following-sibling::p')[:2]:
                name = councillor.xpath('./strong')[0].text_content()

                p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
                p.add_source(COUNCIL_PAGE)

                # The first text node is the address; the rest are passed on
                # to get_tel_numbers.
                info = councillor.xpath('./text()')
                address = info.pop(0)
                p.add_contact('address', address, 'legislature')

                # get phone numbers
                for line in info:
                    # Split on \xbb / \xa0, then drop the empty/None pieces
                    # and the \xa0 separators themselves.
                    stuff = re.split(r'(\xbb)|(\xa0)', line)
                    tmp = [y for y in stuff if y and not re.match(r'\xa0', y)]
                    self.get_tel_numbers(tmp, p)

                email = self.get_email(councillor)
                p.add_contact('email', email)

                yield p
Exemple #27
0
    def scrape(self):
        """Yield a Person for each non-empty cell of the PageContent table.

        The first cell in the list is treated as the mayor (district
        'Kirkland'). Every other cell takes its district from the text after
        "- " in its ``h2`` heading, with 'Ouest'/'Est' lower-cased.
        """
        page = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1')

        cells = page.xpath('//div[@id="PageContent"]/table/tbody/tr/td')
        assert len(cells), 'No councillors found'
        for position, cell in enumerate(cells):
            if not cell.text_content().strip():
                continue

            if position == 0:
                district, role = 'Kirkland', 'Maire'
            else:
                heading = cell.xpath('.//h2')[0].text_content()
                district = re.search('- (.+)', heading).group(1).strip()
                district = district.replace(' Ouest', ' ouest')
                district = district.replace(' Est', ' est')
                role = 'Conseiller'

            name = cell.xpath('.//strong/text()')[0]

            # Drop the "T " prefix, dash-separate the digits and turn the
            # ",-#-" left before the extension into " x".
            raw_phone = cell.xpath('.//div[contains(text(), "#")]/text()')[0]
            phone = raw_phone.replace('T ', '').replace(' ', '-').replace(',-#-', ' x')
            email = self.get_email(cell)

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)
            person.add_contact('voice', phone, 'legislature')
            person.add_contact('email', email)
            person.image = cell.xpath('.//img/@src')[0]
            yield person
Exemple #28
0
    def scrape(self):
        """Yield a Person for each non-vacant entry in the ``view-content`` div.

        Entries whose role text contains 'Ward' use that text as the district
        and get the role 'Councillor'. 'At Large' entries get numbered seats.
        Any other role keeps its text and the district "St. John's".
        """
        seat = 1

        page = self.lxmlize(COUNCIL_PAGE)
        for entry in page.xpath('//div[@class="view-content"]/div'):
            fields = entry.xpath('./div')
            role = fields[0].xpath('./div//text()')[0]
            # The link text is title-cased; keep what follows the role text.
            link_text = fields[2].xpath('.//a//text()')[0]
            name = link_text.title().split(role)[-1].strip()
            if name == 'Vacant':
                continue

            if 'Ward' in role:
                district, role = role, 'Councillor'
            elif 'At Large' in role:
                role = 'Councillor at Large'
                district = "St. John's (seat {})".format(seat)
                seat += 1
            else:
                district = "St. John's"

            phone = fields[3].xpath('./div//text()')[0]
            email = self.get_email(fields[5])
            image = entry.xpath('.//img/@src')[0]

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)
            person.add_contact('voice', phone, 'legislature')
            person.add_contact('email', email)
            person.image = image
            yield person
Exemple #29
0
    def scrape(self):
        """Yield a Person for each ``block text`` div on COUNCIL_PAGE.

        A heading containing 'Maire' marks the mayor (district
        'Sainte-Anne-de-Bellevue'). Any other heading must contain a number,
        which becomes ``District <n>``.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        blocks = page.xpath('//div[@class="block text"]')
        assert len(blocks), 'No councillors found'
        for block in blocks:
            name = block.xpath('.//div[@class="content-writable"]//strong/text()')[0]
            heading = block.xpath('.//h2/text()')[0]

            if 'Maire' not in heading:
                number = re.search(r'\d+', heading)[0]
                district = 'District {}'.format(number)
                role = 'Conseiller'
            else:
                district = 'Sainte-Anne-de-Bellevue'
                role = 'Maire'

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)

            # First src attribute found anywhere inside the block.
            person.image = block.xpath('.//@src')[0]
            person.add_contact('email', self.get_email(block))
            yield person
Exemple #30
0
    def scrape(self):
        """Yield an MLA Person for each non-vacant row of the first table.

        Rows after the header must have exactly three cells: name ("Last,
        First"), constituency and party. The email and optional image come
        from the member's own page, linked from the name cell.
        """
        member_page = self.lxmlize(COUNCIL_PAGE, encoding='utf-8')
        table = member_page.xpath('//table')[0]
        rows = table.xpath('.//tr')[1:]  # skip the header row
        assert len(rows), 'No members found'
        for row in rows:
            namecell, constitcell, partycell = row.xpath('.//td')
            full_name = namecell.text_content().strip()
            if full_name.lower() == 'vacant':
                continue
            # "Last, First" -> "First Last", dropping any "Hon." prefix.
            last, first = full_name.split(',')
            given = first.replace('Hon.', '').strip()
            family = last.title().strip()
            name = given + ' ' + family
            # Collapse runs of whitespace in the constituency name.
            district = ' '.join(constitcell.text_content().split())
            party = get_party(partycell.text)

            link = namecell.xpath('.//a')[0]
            url = link.get('href')

            page = self.lxmlize(url)
            email = self.get_email(page)

            p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=party)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.add_contact('email', email)

            image = page.xpath('//img[@class="page_graphic"]/@src')
            if image:
                p.image = image[0]

            yield p