Example #1
0
    def scrape(self):
        """Yield a ``Person`` for each non-vacant row of the MLA table.

        The listing at ``COUNCIL_PAGE`` supplies the name, party, district and
        profile URL. The profile page supplies the photo, an optional website
        link, the constituency address, the phone numbers and the email.

        Phone numbers are treated as optional: a profile that lacks the
        legislative or constituency number is still yielded, without that
        contact detail, instead of aborting the scrape with ``IndexError``.
        """
        listing = self.lxmlize(COUNCIL_PAGE)

        # The first <tr> is dropped (presumably the header row).
        for row in listing.xpath('//table[@id="MLAs"]//tr')[1:]:
            cells = row.xpath('./td')
            if 'Vacant' in cells[0].text_content():
                continue

            # Keep only what follows the first '. ' in the cell text
            # (presumably strips an honorific such as "Hon. ").
            name = cells[0].text_content().split('. ', 1)[1]
            party = cells[1].text
            district = cells[2].text_content()
            url = row.xpath('./td[1]/a/@href')[0]
            profile = self.lxmlize(url)

            p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=party)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.image = profile.xpath('//div[contains(@class, "mla-image-cell")]/img/@src')[0]

            contact = profile.xpath('//div[@id="mla-contact"]/div[2]')[0]
            website = contact.xpath('./div[3]/div[3]/div[2]/a')
            if website:
                p.add_link(website[0].text_content())

            p.add_contact('address', ' '.join(contact.xpath('.//div[@class="col-md-4"][2]/div//text()')[1:9]), 'constituency')

            # Either phone number may be absent from a profile; only record
            # the ones that are actually present.
            phone_leg = contact.xpath('.//span[@id="MainContent_ContentBottom_Property6"]//text()')
            if phone_leg:
                p.add_contact('voice', phone_leg[0], 'legislature', area_code=306)
            phone_const = contact.xpath('.//div[@class="col-md-4"]/div[4]/span/span/text()')
            if phone_const:
                p.add_contact('voice', phone_const[0], 'constituency', area_code=306)

            email = self.get_email(contact)
            p.add_contact('email', email)

            yield p
Example #2
0
    def scrape(self):
        """Yield an ``Organization`` and a ``Person`` per listed official.

        Every ``<strong>`` inside a paragraph of the ``entry-content`` div is
        treated as one official. The district comes from the nearest
        preceding ``<h2>``, the name from the last two words of the bold
        text, and the role from whatever remains before the first hyphen.
        Entries whose role contains ``SAO`` or is empty are skipped.

        The separator characters are written as escaped unicode literals
        rather than ``'...'.decode('utf-8')``, which fails on Python 3
        (``str`` has no ``decode``) and depends on the file's encoding.
        """
        en_dash = u'\u2013'  # the EN DASH that follows the district name in the <h2>
        stray_suffix = u'-\u00c2'  # "-Â", removed from the extracted name

        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//div[@class="entry-content"]//p/strong')
        for councillor in councillors:
            heading = councillor.xpath('./ancestor::p/preceding-sibling::h2')[-1]
            district = heading.text_content().split(en_dash)[0]
            name = ' '.join(councillor.text_content().split()[-2:]).replace(stray_suffix, '')
            role = councillor.text_content().replace(name, '').split('-')[0]
            if 'SAO' in role or not role:
                continue

            org = Organization(name=district + ' Municipal Council', classification='legislature', jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(COUNCIL_PAGE)
            yield org

            p = Person(primary_org='legislature', name=name, district=district)
            p.add_source(COUNCIL_PAGE)
            membership = p.add_membership(org, role=role, district=district)

            # Each direct text node of the paragraph is classified by keyword.
            for line in councillor.xpath('./ancestor::p/text()'):
                if 'NT' in line:
                    membership.add_contact_detail('address', line.strip(), 'legislature')
                if 'Tel' in line:
                    # "(867) 555-1234" style numbers become "867-555-1234".
                    line = line.replace('Tel. ', '').replace('(', '').replace(') ', '-').strip()
                    membership.add_contact_detail('voice', line, 'legislature')
                if 'Fax' in line:
                    line = line.replace('Fax ', '').replace('(', '').replace(') ', '-').strip()
                    membership.add_contact_detail('fax', line, 'legislature')
            email = self.get_email(councillor, './parent::p')
            membership.add_contact_detail('email', email)

            if 'Website' in councillor.xpath('./parent::p')[0].text_content():
                # The second link of the paragraph is used as the website.
                p.add_link(councillor.xpath('./parent::p//a')[1].attrib['href'])
            yield p
Example #3
0
    def scrape(self):
        """Yield a ``Person`` for every link in the council listing.

        Each linked profile page is fetched. Its section heading decides the
        role and district: the mayor gets district ``Richmond Hill``; other
        members use the text between the comma and the hyphen of the heading
        or, when that is absent, a numbered ``Richmond Hill (seat N)``.
        Address, phone and fax are cut out of the profile's info cell with
        regular expressions.
        """
        next_seat = 1

        listing = self.lxmlize(COUNCIL_PAGE)

        for anchor in listing.xpath('//center/center//a'):
            name = anchor.text_content().strip()
            url = anchor.attrib['href']
            profile = self.lxmlize(url)
            header = profile.xpath('//div[@class="sectionheading"]')[0].text_content()

            if header == 'Mayor of Richmond Hill':
                district, role = 'Richmond Hill', 'Mayor'
            else:
                wards = re.findall(r',(.*)-', header)
                if wards:
                    district = wards[0].strip()
                else:
                    # No ward in the heading: hand out numbered seats.
                    district = 'Richmond Hill (seat {})'.format(next_seat)
                    next_seat += 1

                if 'Regional' in header:
                    role = 'Regional Councillor'
                else:
                    role = 'Councillor'

            cells = profile.xpath(
                '//table[@cellpadding>0]/tbody/tr/td[last()]|//table[not(@cellpadding)]/tbody/tr/td[last()]'
            )
            info = cells[0].text_content().replace(' - office:', ':')

            raw_address = re.findall(
                r'(?<=Town of Richmond Hill)(.*(?=Telephone:)|(?=Telephone))',
                info)[0]
            # Put a space wherever a lowercase letter runs into an uppercase one.
            address = re.sub(r'([a-z])([A-Z])', r'\1 \2', raw_address)

            # The original author expected '(.*)(?=\sTelephone|Telephone|Fax)'
            # to work here, but it did not.
            raw_phone = re.findall(
                r'(?<=Telephone:) ((.*) (?=Telephone)|(.*)(?=Telephone)|(.*)(?=Fax))',
                info)[0][0]
            phone = raw_phone.replace('(', '').replace(') ', '-').replace(', ext. ', ' x')

            raw_fax = re.findall(r'(?<=Fax:) (.*)(?=E-mail)', info)[0]
            fax = raw_fax.replace(' ', '').replace('(', '').replace(')', '-')

            email = self.get_email(profile)

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.add_contact('address', address, 'legislature')
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('fax', fax, 'legislature')
            p.add_contact('email', email)
            p.image = profile.xpath('//img[contains(@alt, "{}")]/@src'.format(name))[0]
            if 'Website' in info:
                p.add_link(re.findall(r'www\..*\.[a-z]+', info)[0])
            yield p
Example #4
0
    def scrape(self):
        """Yield the mayor, then a ``Person`` per councillor table row.

        Rows containing ``Position`` are skipped. Councillors sharing a ward
        are told apart by a running ``(seat N)`` suffix on the district.
        Email, phone, photo and an optional link come from each councillor's
        own page.
        """
        seats_taken = defaultdict(int)

        listing = self.lxmlize(COUNCIL_PAGE)

        yield self.scrape_mayor()

        for row in listing.xpath('//div[@id="centre_content"]//tr'):
            if 'Position' in row.text_content():
                continue

            cells = row.xpath('./td')
            ward = cells[0].text_content().replace('Councillor', '')
            seats_taken[ward] += 1
            district = '{} (seat {})'.format(ward, seats_taken[ward])
            name = cells[1].text_content()
            url = row.xpath('./td/a')[0].attrib['href']

            p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            profile = self.lxmlize(url)

            content = profile.xpath('//div[@id="centre_content"]')[0]
            p.add_contact('email', self.get_email(content))
            phone = self.get_phone(content, area_codes=[226, 519])
            p.add_contact('voice', phone, 'legislature')

            # string(...) yields '' when there is no image, so this can be empty.
            p.image = profile.xpath('string(//div[@id="centre_content"]//img/@src)')

            # Only when the content area has more than two links is the
            # last one recorded as the councillor's link.
            anchors = profile.xpath('//div[@id="centre_content"]//a')
            if len(anchors) > 2:
                p.add_link(anchors[-1].attrib['href'])
            yield p
Example #5
0
    def scrape(self):
        """Yield the mayor, then a ``Person`` per councillor table cell.

        The cells of the ``printArea`` table are sliced with ``[4:-1]``; the
        first goes to ``scrape_mayor`` and the rest are councillors. A
        heading containing a hyphen gives a ward ``Councillor``; otherwise
        the member is a ``Regional Councillor`` for ``Newmarket``. Phone and
        fax numbers are parsed from the councillor's own page.
        """
        listing = self.lxmlize(COUNCIL_PAGE)

        cells = listing.xpath('//div[@id="printArea"]//table//tr//td')[4:-1]
        yield self.scrape_mayor(cells[0])
        for cell in cells[1:]:
            name = ' '.join(cell.xpath('.//strong/a[last()]//text()')[0].split())
            heading_parts = cell.xpath('.//strong//text()')[0].split('-')
            if len(heading_parts) > 1:
                district, role = heading_parts[1], 'Councillor'
            else:
                district, role = 'Newmarket', 'Regional Councillor'
            url = cell.xpath('.//a/@href')[0]

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            p.image = cell.xpath('.//img/@src')[0]

            profile = self.lxmlize(url)
            print_area = profile.xpath('//div[@id="printArea"]')[0]
            paragraphs = print_area.xpath('.//p[@class="heading"][2]/following-sibling::p')

            # The address paragraph(s) are consumed so that the numbers
            # paragraph moves to the front; the address text itself is not
            # used further in this method.
            address = paragraphs.pop(0).text_content().strip()
            if not address:
                address = paragraphs.pop(0).text_content().strip()

            if 'Ward' in paragraphs[0].text_content():
                paragraphs.pop(0)

            numbers = paragraphs.pop(0).text_content().split(':')
            p.add_contact('email', self.get_email(profile))

            # After splitting on ':', each chunk holds a value and the text
            # just before the next ':' (used as the label of the next value).
            for label_chunk, chunk in zip(numbers, numbers[1:]):
                if '@' in chunk:
                    continue  # executive assistant email

                number = re.findall(r'([0-9]{3}-[0-9]{3}-[0-9]{4})', chunk)[0]
                extension = re.findall(r'(Ext\. [0-9]{3,4})', chunk)
                if extension:
                    number = number + extension[0].replace('Ext. ', ' x')
                contact_type = re.findall(r'[A-Za-z]+$', label_chunk)[0]

                if 'Fax' in contact_type:
                    p.add_contact('fax', number, 'legislature')
                elif 'Phone' in contact_type:
                    p.add_contact('voice', number, 'legislature')
                else:
                    p.add_contact(contact_type, number, contact_type)

            site = profile.xpath('.//a[contains(text(), "http://")]')
            if site:
                p.add_link(site[0].text_content())
            yield p
Example #6
0
    def scrape(self):
        """Yield a ``Person`` for each non-vacant row of the MLA table.

        Asserts that the table has at least one member row. The profile page
        of every member supplies the photo, an optional website link, the
        constituency address and, when present, the phone numbers and email.
        """
        listing = self.lxmlize(COUNCIL_PAGE)

        # The first <tr> is dropped (presumably the header row).
        members = listing.xpath('//table[@id="MLAs"]//tr')[1:]
        assert len(members), 'No members found'
        for member in members:
            cells = member.xpath('./td')
            if 'Vacant' in cells[0].text_content():
                continue

            # Keep only what follows the first '. ' in the cell text.
            name = cells[0].text_content().split('. ', 1)[1]
            party = cells[1].text
            district = cells[2].text_content()
            url = member.xpath('./td[1]/a/@href')[0]
            profile = self.lxmlize(url)

            p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=party)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.image = profile.xpath('//div[contains(@class, "mla-image-cell")]/img/@src')[0]

            contact = profile.xpath('//div[@id="mla-contact"]/div[2]')[0]
            website = contact.xpath('./div[3]/div[3]/div[2]/a')
            if website:
                p.add_link(website[0].text_content())

            address_parts = contact.xpath('.//div[@class="col-md-4"][2]/div//text()')[1:9]
            p.add_contact('address', ' '.join(address_parts), 'constituency')

            # Phone numbers and email are optional on a profile.
            phone_leg = contact.xpath('.//span[@id="MainContent_ContentBottom_Property6"]//text()')
            if phone_leg:
                p.add_contact('voice', phone_leg[0], 'legislature', area_code=306)

            phone_const = contact.xpath('.//div[@class="col-md-4"]/div[4]/span/span/text()')
            if phone_const:
                p.add_contact('voice', phone_const[0], 'constituency', area_code=306)

            email = self.get_email(contact, error=False)
            if email:
                p.add_contact('email', email)

            yield p
Example #7
0
    def scrape(self):
        """Yield the mayor, then a ``Person`` per linked council table cell.

        The mayor's page is found through the "Office of the Mayor" link.
        For every other cell the first non-blank link text decides the role:
        a text containing ``Ward`` is a ward councillor, one containing
        ``Regional`` gets a numbered ``Markham (seat N)`` district, and any
        other text is used verbatim as the role for district ``Markham``.
        """
        next_regional_seat = 1

        listing = self.lxmlize(COUNCIL_PAGE)

        mayor_url = listing.xpath('//a[contains(text(), "Office of the Mayor")]/@href')[0]
        yield self.scrape_mayor(mayor_url)

        councillors = listing.xpath('//div[@class="interiorContentWrapper"]//td[./a]')
        assert len(councillors), 'No councillors found'
        for cell in councillors:
            bold_text = ' '.join(cell.xpath('.//strong/text()'))
            # The name is whatever follows the title, when a title is present.
            name = bold_text
            for title in ('Mayor', 'Councillor'):
                if title in bold_text:
                    name = bold_text.split(title)[1]
                    break

            link_text = cell.xpath('.//a//text()[normalize-space()]')[0]
            if 'Ward' in link_text:
                role = 'Councillor'
                district = link_text.replace('Councillor', '')
            elif 'Regional' in link_text:
                role = 'Regional Councillor'
                district = 'Markham (seat {})'.format(next_regional_seat)
                next_regional_seat += 1
            else:
                role = link_text
                district = 'Markham'

            image = cell.xpath('.//img/@src')[0]
            url = cell.xpath('.//a/@href')[0]

            address, phone, email, links = self.get_contact(url)

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            p.image = image
            p.add_contact('address', address, 'legislature')
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('email', email)

            for link in links:
                p.add_link(link)

            yield p
Example #8
0
    def scrape(self):
        """Yield a ``Person`` for each Mayor/Councillor navigation link.

        The first matching link is taken to be the mayor of ``Ajax``. For
        every other link the ward and role are parsed from the ``<h1>`` of
        the linked page. Contact details come from the rows of the first
        ``datatable`` table (its first row is skipped); rows that are neither
        a known contact type nor ``Address`` are recorded as links.
        """
        listing = self.lxmlize(COUNCIL_PAGE)

        councillors = listing.xpath(
            '//ul[@class="subNav top"]/li/ul//li/a[contains(text(), "Councillor")] | //ul[@class="subNav top"]/li/ul//li/a[contains(text(), "Mayor")]'
        )
        assert len(councillors), 'No councillors found'
        for anchor in councillors:
            link_text = anchor.text_content()

            url = anchor.attrib['href']
            profile = self.lxmlize(url)

            if anchor == councillors[0]:
                district, role = 'Ajax', 'Mayor'
            else:
                heading = profile.xpath('//div[@id="printAreaContent"]//h1')[0].text_content()
                district = re.findall(r'Ward.*', heading)[0].strip()
                role = re.search('((?:Regional )?Councillor)', heading).group(1)

            # The link text carries the role as well; strip it to get the name.
            name = link_text.replace(role, '').strip()

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            p.image = profile.xpath('//div[@class="intQuicklinksPhoto"]//img/@src')[0]

            for row in profile.xpath('//table[@class="datatable"][1]//tr')[1:]:
                label = row.xpath('./td')[0].text_content().strip()
                if re.match(r'(Home)|(Cell)|(Phone)|(Fax)|(Email)', label):
                    value = row.xpath('./td')[1].text_content().strip()
                    detail_type = CONTACT_DETAIL_TYPE_MAP[label]
                    # Emails carry no note; every other detail is tagged.
                    note = '' if detail_type == 'email' else 'legislature'
                    p.add_contact(detail_type, value, note)
                elif label == 'Address':
                    value = ''.join(row.xpath('./td[2]//text()')).strip()
                    p.add_contact(label, value, 'legislature')
                else:
                    p.add_link(row.xpath('./td[2]/a/@href')[0])
            yield p
Example #9
0
    def scrape(self):
        """Yield the mayor, then a ``Person`` per non-vacant council item.

        Each ``documentexcerpt-module__item`` div links to a councillor page
        whose title has the form ``<district> - <name>``. The photo, the
        address and the rows of the "Contact information" table are read
        from that page.
        """
        yield self.scrape_mayor()
        listing = self.lxmlize(COUNCIL_PAGE)

        items = listing.xpath('//div[contains(@class, "documentexcerpt-module__item")]')
        assert len(items), 'No councillors found'
        for item in items:
            if item[1].text == 'Vacant':
                continue

            page_url = item[0].attrib['href']
            profile = self.lxmlize(page_url)
            title = profile.xpath('//h1[contains(@class, "page-title")]')[0].text_content()
            district, name = title.split(' - ', 1)

            p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(page_url)

            images = profile.xpath('//div[contains(@class, "content")]//img/@src')
            if images:
                p.image = images[0]

            address_nodes = profile.xpath('//address//p')
            if address_nodes:
                p.add_contact('address', address_nodes[0].text_content(), 'legislature')

            for row in profile.xpath('//table[@summary="Contact information"]//tr'):
                label = row.xpath('./th/text()')[0]
                value = row.xpath('./td//text()')[0]
                if 'Title' in label:
                    continue
                if any(site in label for site in ('Website', 'Facebook', 'Twitter')):
                    # For links the anchor text is recorded, not the first text node.
                    p.add_link(row.xpath('./td/a/text()')[0])
                elif 'Telephone' in label:
                    p.add_contact('voice', value, 'legislature')
                elif 'Fax' in label:
                    p.add_contact('fax', value, 'legislature')
                elif 'Email' in label:
                    p.add_contact('email', value)
            yield p
Example #10
0
    def scrape(self):
        """Yield an ``Organization`` per district and a ``Person`` per member.

        The first four links of the first ``bluearrow`` list on
        ``COUNCIL_PAGE`` each lead to a table of districts. Every district
        page yields one organization, named from the page header plus
        ``org_types[<position of the list link>]``, followed by one person
        per non-vacant name on the page. The first name on a district page
        gets the ``Mayor`` role, the others ``Councillor``; all share the
        district's address, phone, fax, email and website.
        """
        index = self.lxmlize(COUNCIL_PAGE)
        type_links = index.xpath('//div[@class="bluearrow shaded bottomborder "][1]/ul/li/a/@href')[:4]
        for org_type, link in enumerate(type_links):
            listing = self.lxmlize(link)
            district_urls = listing.xpath('//div[@class="parbase list section cplist"]/table/tr/td[1]/b/a/@href')
            for district_url in district_urls:
                detail = self.lxmlize(district_url)
                header = detail.xpath('//div[@class="pageHeader"]/h1/text()')[0]
                district = header.split(' - ')[1].strip()

                org = Organization(name=district + org_types[org_type], classification='legislature', jurisdiction_id=self.jurisdiction.jurisdiction_id)
                org.add_source(district_url)
                yield org

                address = ', '.join(detail.xpath('//div[@class="left_contents"]/p[1]/text()'))
                contacts = detail.xpath('//div[@class="left_contents"]/p[b[text() = "Contact"]]/text()')
                # The first two text nodes are "<label>: <number>" lines for
                # phone and fax; spaces in the number become hyphens.
                phone = contacts[0].split(':')[1].strip().replace(' ', '-')
                fax = contacts[1].split(':')[1].strip().replace(' ', '-')
                email = self.get_email(detail, '//div[@class="left_contents"]')

                site_links = detail.xpath('//div[@class="left_contents"]//a[not(contains(@href,"mailto:"))]')
                site = site_links[0].text_content() if site_links else None

                names = detail.xpath('//div[@class="right_contents"]//p/text()')
                for position, councillor in enumerate(names):
                    if 'Vacant' in councillor:
                        continue
                    p = Person(primary_org='legislature', name=councillor, district=district)
                    p.add_source(COUNCIL_PAGE)
                    p.add_source(link)
                    p.add_source(district_url)

                    role = 'Mayor' if position == 0 else 'Councillor'
                    membership = p.add_membership(org, role=role)

                    membership.post_id = district
                    membership.add_contact_detail('address', address, 'legislature')
                    if phone:
                        membership.add_contact_detail('voice', phone, 'legislature')
                    if fax:
                        membership.add_contact_detail('fax', fax, 'legislature')
                    if email:
                        membership.add_contact_detail('email', email)
                    if site:
                        p.add_link(site)
                    yield p
Example #11
0
    def scrape(self):
        """Yield an ``Organization`` and a ``Person`` per listed official.

        Every ``<strong>`` inside a paragraph of the ``entry-content`` div is
        treated as one official. The district comes from the nearest
        preceding ``<h2>``, the name from the last two words of the bold
        text, and the role from whatever remains before the first hyphen.
        Entries whose role contains ``SAO`` or is empty are skipped.

        The separator characters are written as escaped unicode literals
        rather than ``'...'.decode('utf-8')``, which fails on Python 3
        (``str`` has no ``decode``) and depends on the file's encoding.
        """
        en_dash = u'\u2013'  # the EN DASH that follows the district name in the <h2>
        stray_suffix = u'-\u00c2'  # "-Â", removed from the extracted name

        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//div[@class="entry-content"]//p/strong')
        for councillor in councillors:
            heading = councillor.xpath(
                './ancestor::p/preceding-sibling::h2')[-1]
            district = heading.text_content().split(en_dash)[0]
            name = ' '.join(councillor.text_content().split()[-2:]).replace(
                stray_suffix, '')
            role = councillor.text_content().replace(name, '').split('-')[0]
            if 'SAO' in role or not role:
                continue

            org = Organization(
                name=district + ' Municipal Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(COUNCIL_PAGE)
            yield org

            p = Person(primary_org='legislature', name=name, district=district)
            p.add_source(COUNCIL_PAGE)
            membership = p.add_membership(org, role=role, district=district)

            # Each direct text node of the paragraph is classified by keyword.
            for line in councillor.xpath('./ancestor::p/text()'):
                if 'NT' in line:
                    membership.add_contact_detail('address', line.strip(),
                                                  'legislature')
                if 'Tel' in line:
                    # "(867) 555-1234" style numbers become "867-555-1234".
                    line = line.replace('Tel. ', '').replace(
                        '(', '').replace(') ', '-').strip()
                    membership.add_contact_detail('voice', line,
                                                  'legislature')
                if 'Fax' in line:
                    line = line.replace('Fax ', '').replace(
                        '(', '').replace(') ', '-').strip()
                    membership.add_contact_detail('fax', line, 'legislature')
            email = self.get_email(councillor, './parent::p')
            membership.add_contact_detail('email', email)

            if 'Website' in councillor.xpath('./parent::p')[0].text_content():
                # The second link of the paragraph is used as the website.
                p.add_link(
                    councillor.xpath('./parent::p//a')[1].attrib['href'])
            yield p
Example #12
0
    def scrape(self):
        """Yield the mayor, then a ``Person`` per linked council table cell.

        The mayor's page is found through the "Office of the Mayor" link.
        For every other cell the first non-blank link text decides the role:
        a text containing ``Ward`` is a ward councillor, one containing
        ``Regional`` gets a numbered ``Markham (seat N)`` district, and any
        other text is used verbatim as the role for district ``Markham``.
        """
        next_regional_seat = 1

        listing = self.lxmlize(COUNCIL_PAGE)

        mayor_url = listing.xpath('//a[contains(text(), "Office of the Mayor")]/@href')[0]
        yield self.scrape_mayor(mayor_url)

        councillors = listing.xpath('//div[@class="interiorContentWrapper"]//td[./a]')
        assert len(councillors), 'No councillors found'
        for cell in councillors:
            bold_text = ' '.join(cell.xpath('.//strong/text()'))
            # The name is whatever follows the title, when a title is present.
            name = bold_text
            for title in ('Mayor', 'Councillor'):
                if title in bold_text:
                    name = bold_text.split(title)[1]
                    break

            link_text = cell.xpath('.//a//text()[normalize-space()]')[0]
            if 'Ward' in link_text:
                role = 'Councillor'
                district = link_text.replace('Councillor', '')
            elif 'Regional' in link_text:
                role = 'Regional Councillor'
                district = 'Markham (seat {})'.format(next_regional_seat)
                next_regional_seat += 1
            else:
                role = link_text
                district = 'Markham'

            image = cell.xpath('.//img/@src')[0]
            url = cell.xpath('.//a/@href')[0]

            address, phone, email, links = self.get_contact(url)

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            p.image = image
            p.add_contact('address', address, 'legislature')
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('email', email)

            for link in links:
                p.add_link(link)

            yield p
Example #13
0
    def scrape(self):
        """Yield a ``Person`` for every pictured cell of the council table.

        A cell whose first bold text contains ``Councillor`` is a councillor:
        the role is the first two words of the following text (with a
        leading ``City `` removed) and the ward is the remainder. Any other
        cell is treated as the mayor of ``Pickering``. Shared contact lines
        come from the first table on the page: column one for the mayor,
        column two for councillors.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        mayor_contacts = page.xpath('//table[1]//tr/td[1]/text()')
        council_contacts = page.xpath('//table[1]//tr/td[2]/text()')

        councillors = page.xpath(
            '//table[@id="Table3table"]//img/ancestor::td')
        assert len(councillors), 'No councillors found'
        for councillor in councillors:
            name = councillor.xpath('.//strong//text()')[0]

            if 'Councillor' in name:
                name = name.replace('Councillor', '').strip()
                role_ward = councillor.xpath('./text()')[0]
                if not role_ward.strip():
                    # The cell's own first text node is blank; fall back to
                    # the first paragraph text inside it.
                    role_ward = councillor.xpath('.//p/text()')[0]
                role_ward = role_ward.split(' ')
                # Raw string: '\A' in a plain literal is an invalid escape
                # sequence, which newer Python versions warn about.
                role = re.sub(r'\ACity ', '', ' '.join(role_ward[:2]))
                ward = ' '.join(role_ward[2:])
            else:
                name = councillor.xpath('.//strong/text()')[1]
                role = 'Mayor'
                ward = 'Pickering'

            email = self.get_email(councillor)
            p = Person(primary_org='legislature',
                       name=name,
                       district=ward,
                       role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_contact('email', email)
            p.image = councillor.xpath('.//img/@src')[0]

            for link in councillor.xpath('.//a'):
                link_text = link.text_content()
                if '@' in link_text:
                    continue  # email link, already handled by get_email
                if 'Profile' in link_text:
                    p.add_source(link.attrib['href'])
                else:
                    p.add_link(link.attrib['href'])

            if role == 'Mayor':
                add_contacts(p, mayor_contacts)
            else:
                add_contacts(p, council_contacts)
            yield p
Example #14
0
    def scrape(self):
        """Yield an ``Organization`` per district and a ``Person`` per official.

        Every link in the left/right content columns of ``COUNCIL_PAGE`` is a
        district. Its page lists contact fields as ``<dl>`` entries, read up
        to the "Officials" entry, and the elected officials as lines of a
        ``<pre>`` block. A ``(Mayor)``, ``(Deputy Mayor)`` or
        ``(Chairperson)`` suffix on a line becomes the role; lines without
        one get ``Councillor``.

        The contact fields are reset for every district and only added when
        the district page actually provided them, so a missing field can no
        longer raise ``NameError`` or silently inherit the previous
        district's value.
        """
        index = self.lxmlize(COUNCIL_PAGE)

        districts = index.xpath('//div[@id="left-content" or @id="right-content"]//a')
        for district in districts:
            url = district.attrib['href']
            page = self.lxmlize(url)
            district_name = district.text_content()

            org = Organization(name=district_name + ' Council', classification='legislature', jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(url)
            yield org

            # Start from a clean slate so nothing leaks between districts.
            phone = fax = address = email = site = None
            for entry in page.xpath('//div[@style="WIDTH:750"]/dl'):
                contact_type = entry.xpath('./dt')[0].text_content()
                # "(867) 555-1234" style values become "867-555-1234".
                value = entry.xpath('./dd')[0].text_content().replace('(', '').replace(') ', '-')
                if 'Officials' in contact_type:
                    break
                if 'Tel' in contact_type:
                    phone = value
                if 'Fac' in contact_type:
                    fax = value
                if 'Address' in contact_type:
                    address = value
                if 'Email' in contact_type:
                    email = value
                if 'Website' in contact_type:
                    site = value

            councillors = page.xpath('//div[@style="WIDTH:750"]/dl/dt[contains(text(), "Elected Officials")]/parent::dl/dd/pre/text()')[0].splitlines(True)
            for councillor in councillors:
                name = councillor.replace('(Mayor)', '').replace('(Deputy Mayor)', '').replace('(Chairperson)', '').strip()
                # Whatever is left once the name is removed is the
                # parenthesised role, if any.
                role = re.sub(r'\(|\)', '', councillor.replace(name, '').strip())
                if not role:
                    role = 'Councillor'
                p = Person(primary_org='legislature', name=name, district=district_name)
                p.add_source(COUNCIL_PAGE)
                p.add_source(url)
                membership = p.add_membership(org, role=role, district=district_name)
                if phone is not None:
                    membership.add_contact_detail('voice', self.clean_telephone_number(phone), 'legislature')
                if fax is not None:
                    membership.add_contact_detail('fax', self.clean_telephone_number(fax), 'legislature')
                if address is not None:
                    membership.add_contact_detail('address', self.clean_address(address), 'legislature')
                if email is not None:
                    membership.add_contact_detail('email', email)
                if site:
                    p.add_link(site)
                yield p
Example #15
0
    def scrape(self):
        """Yield the mayor, then a ``Person`` per councillor table row.

        Asserts that the content table has at least one row. Rows containing
        ``Position`` are skipped. Councillors sharing a ward are told apart
        by a running ``(seat N)`` suffix on the district. Email, phone, photo
        and an optional link come from each councillor's own page.
        """
        seats_taken = defaultdict(int)

        listing = self.lxmlize(COUNCIL_PAGE)

        yield self.scrape_mayor()

        rows = listing.xpath('//div[@id="centre_content"]//tr')
        assert len(rows), 'No councillors found'
        for row in rows:
            if 'Position' in row.text_content():
                continue

            cells = row.xpath('./td')
            ward = cells[0].text_content().replace('Councillor', '')
            seats_taken[ward] += 1
            district = '{} (seat {})'.format(ward, seats_taken[ward])
            name = cells[1].text_content()
            url = row.xpath('./td/a')[0].attrib['href']

            p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            profile = self.lxmlize(url)

            content = profile.xpath('//div[@id="centre_content"]')[0]
            p.add_contact('email', self.get_email(content))
            phone = self.get_phone(content, area_codes=[226, 519])
            p.add_contact('voice', phone, 'legislature')

            # string(...) yields '' when there is no image, so this can be empty.
            p.image = profile.xpath('string(//div[@id="centre_content"]//img/@src)')

            # Only when the content area has more than two links is the
            # last one recorded as the councillor's link.
            anchors = profile.xpath('//div[@id="centre_content"]//a')
            if len(anchors) > 2:
                p.add_link(anchors[-1].attrib['href'])
            yield p
Example #16
0
    def scrape(self):
        """Yield the mayor, then a ``Person`` per non-vacant ward header cell.

        Each ``<th>`` whose text contains ``Ward`` names a district; its
        second child element holds the councillor's name and profile link.
        The photo, the address and the rows of the ``contactListing`` table
        are read from the profile page.
        """
        yield self.scrape_mayor()

        listing = self.lxmlize(COUNCIL_PAGE)
        ward_cells = listing.xpath('//th[contains(text(), "Ward")]')
        assert len(ward_cells), 'No councillors found'
        for cell in ward_cells:
            district = cell.text
            name = cell[1].text
            if name == 'Vacant':
                continue

            page_url = cell[1].attrib['href']
            profile = self.lxmlize(page_url)

            p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(page_url)

            images = profile.xpath('//div[@id="contentArea"]//img/@src')
            if images:
                p.image = images[0]

            address_nodes = profile.xpath('//address//p')
            if address_nodes:
                p.add_contact('address', address_nodes[0].text_content(), 'legislature')

            for row in profile.xpath('//table[@class="contactListing"]//tr'):
                label = row.xpath('./th/text()')[0]
                value = row.xpath('./td//text()')[0]
                if 'Title' in label:
                    continue
                if any(site in label for site in ('Website', 'Facebook', 'Twitter')):
                    # For links the anchor text is recorded, not the first text node.
                    p.add_link(row.xpath('./td/a/text()')[0])
                elif 'Telephone' in label:
                    p.add_contact('voice', value, 'legislature')
                elif 'Fax' in label:
                    p.add_contact('fax', value, 'legislature')
                elif 'Email' in label:
                    p.add_contact('email', value)
            yield p
Example #17
0
    def scrape(self):
        """Yield a Person for every member linked from COUNCIL_PAGE.

        Each linked profile page is fetched; role and district are derived
        from its "sectionheading" text, and address, phone and fax are cut
        out of the free-text contact cell with regular expressions.
        """
        # Counter used to number seats for members whose header carries no
        # ", <district> -" fragment.
        regional_councillor_seat_number = 1

        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//center/center//a')
        for councillor in councillors:
            name = councillor.text_content().strip()
            url = councillor.attrib['href']
            # From here on `page` is the member's profile, not the listing.
            page = self.lxmlize(url)
            header = page.xpath('//div[@class="sectionheading"]')[0].text_content()
            if header == 'Mayor of Richmond Hill':
                district = 'Richmond Hill'
                role = 'Mayor'
            else:
                # Text between a comma and a hyphen in the header, if any;
                # `district` is a list here and becomes a string below.
                district = re.findall(r',(.*)-', header)
                if district:
                    district = district[0].strip()
                else:
                    district = 'Richmond Hill (seat {})'.format(regional_councillor_seat_number)
                    regional_councillor_seat_number += 1

                role = 'Regional Councillor' if 'Regional' in header else 'Councillor'

            # Last cell of the rows of tables that either have a positive
            # cellpadding or no cellpadding attribute at all.
            info = page.xpath('//table[@cellpadding>0]/tbody/tr/td[last()]|//table[not(@cellpadding)]/tbody/tr/td[last()]')
            # Normalise " - office:" to ":" so the label-based patterns below match.
            info = info[0].text_content().replace(' - office:', ':')

            # Everything after "Town of Richmond Hill" up to "Telephone:".
            address = re.findall(r'(?<=Town of Richmond Hill)(.*(?=Telephone:)|(?=Telephone))', info)[0]
            # Insert a space wherever a lowercase letter is directly followed
            # by an uppercase one (words run together in the extracted text).
            address = re.sub(r'([a-z])([A-Z])', r'\1 \2', address)
            # I expected to be able to do '(.*)(?=\sTelephone|Telephone|Fax)', but nope.
            # findall returns one tuple per match; [0][0] is the outer group.
            # "(905) 123-4567, ext. 12" style text becomes "905-123-4567 x12".
            phone = re.findall(r'(?<=Telephone:) ((.*) (?=Telephone)|(.*)(?=Telephone)|(.*)(?=Fax))', info)[0][0].replace('(', '').replace(') ', '-').replace(', ext. ', ' x')
            # Text between "Fax: " and "E-mail", with spaces and "(" removed
            # and ")" turned into "-".
            fax = re.findall(r'(?<=Fax:) (.*)(?=E-mail)', info)[0].replace(' ', '').replace('(', '').replace(')', '-')
            email = self.get_email(page)

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.add_contact('address', address, 'legislature')
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('fax', fax, 'legislature')
            p.add_contact('email', email)
            # First image whose alt text contains the member's name.
            p.image = page.xpath('//img[contains(@alt, "{}")]/@src'.format(name))[0]
            if 'Website' in info:
                # First "www.….tld"-looking fragment in the contact text.
                p.add_link(re.findall(r'www\..*\.[a-z]+', info)[0])
            yield p
Example #18
0
    def scrape(self):
        """Yield a Person for each image-bearing cell of the council table.

        A cell whose first bold text contains "Councillor" is parsed as a
        councillor (role and ward come from the cell text); any other cell
        is treated as the mayor. Shared contact lines from the first table
        on the page are attached through ``add_contacts``.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        # Column 1 of the first table is applied to the mayor, column 2 to
        # councillors.
        mayor_contacts = page.xpath('//table[1]//tr/td[1]/text()')
        council_contacts = page.xpath('//table[1]//tr/td[2]/text()')

        cells = page.xpath('//table[@id="Table3table"]//img/ancestor::td')
        assert len(cells), 'No councillors found'
        for member_cell in cells:
            heading = member_cell.xpath('.//strong//text()')[0]

            if 'Councillor' not in heading:
                name = member_cell.xpath('.//strong/text()')[1]
                role = 'Mayor'
                ward = 'Pickering'
            else:
                name = heading.replace('Councillor', '').strip()
                description = member_cell.xpath('./text()')[0]
                if not description.strip():
                    # Fall back to paragraph text when the cell's own first
                    # text node is blank.
                    description = member_cell.xpath('.//p/text()')[0]
                words = description.split(' ')
                # First two words form the role (minus a leading "City "),
                # the rest is the ward.
                role = re.sub(r'\ACity ', '', ' '.join(words[:2]))
                ward = ' '.join(words[2:])

            email = self.get_email(member_cell)
            p = Person(primary_org='legislature', name=name, district=ward, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_contact('email', email)
            p.image = member_cell.xpath('.//img/@src')[0]

            for anchor in member_cell.xpath('.//a'):
                label = anchor.text_content()
                if '@' in label:
                    # Email anchors were already handled by get_email.
                    continue
                target = anchor.attrib['href']
                if 'Profile' in label:
                    p.add_source(target)
                else:
                    p.add_link(target)

            add_contacts(p, mayor_contacts if role == 'Mayor' else council_contacts)
            yield p
Example #19
0
    def scrape(self):
        """Yield a Person for each "Councillor"/"Mayor" link in the sub-navigation.

        The first matched link is treated as the mayor; for every other
        link the ward and role are read from the profile page's ``<h1>``.
        Contact rows come from the first "datatable" on the profile page.
        """
        listing = self.lxmlize(COUNCIL_PAGE)

        nav_links = listing.xpath(
            '//ul[@class="subNav top"]/li/ul//li/a[contains(text(), "Councillor")] | //ul[@class="subNav top"]/li/ul//li/a[contains(text(), "Mayor")]'
        )
        for nav_link in nav_links:
            label = nav_link.text_content()

            url = nav_link.attrib["href"]
            profile = self.lxmlize(url)

            if nav_link == nav_links[0]:
                district = "Ajax"
                role = "Mayor"
            else:
                heading = profile.xpath('//div[@id="printAreaContent"]//h1')[0].text_content()
                district = re.findall(r"Ward.*", heading)[0].strip()
                role = re.search("((?:Regional )?Councillor)", heading).group(1)

            # The navigation label carries the role as well; drop it to keep
            # only the person's name.
            name = label.replace(role, "").strip()

            p = Person(primary_org="legislature", name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            p.image = profile.xpath('//div[@class="intQuicklinksPhoto"]//img/@src')[0]

            # The first row of the table is skipped.
            for row in profile.xpath('//table[@class="datatable"][1]//tr')[1:]:
                cells = row.xpath("./td")
                contact_type = cells[0].text_content().strip()
                if re.match(r"(Home)|(Cell)|(Phone)|(Fax)|(Email)", contact_type):
                    value = cells[1].text_content().strip()
                    contact_type = CONTACT_DETAIL_TYPE_MAP[contact_type]
                    # Emails carry no note; everything else is a legislature contact.
                    note = "" if contact_type == "email" else "legislature"
                    p.add_contact(contact_type, value, note)
                    continue
                if contact_type == "Address":
                    value = "".join(row.xpath("./td[2]//text()")).strip()
                    p.add_contact(contact_type, value, "legislature")
                    continue
                # Any other row is expected to hold a link in its second cell.
                p.add_link(row.xpath("./td[2]/a/@href")[0])
            yield p
Example #20
0
    def scrape(self):
        """Yield the mayor, then a Person for each non-vacant listing item.

        Items are the "documentexcerpt-module__item" blocks on COUNCIL_PAGE.
        District and name are both taken from the profile page title, which
        is split on the first " - ".
        """
        yield self.scrape_mayor()
        listing = self.lxmlize(COUNCIL_PAGE)

        items = listing.xpath('//div[contains(@class, "documentexcerpt-module__item")]')
        assert len(items), 'No councillors found'
        for item in items:
            # The second child's text marks vacant seats.
            if item[1].text == 'Vacant':
                continue

            profile_url = item[0].attrib['href']
            profile = self.lxmlize(profile_url)
            title = profile.xpath('//h1[contains(@class, "page-title")]')[0].text_content()
            ward, member_name = title.split(' - ', 1)

            p = Person(primary_org='legislature', name=member_name, district=ward, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(profile_url)

            photos = profile.xpath('//div[contains(@class, "content")]//img/@src')
            if photos:
                p.image = photos[0]

            address_nodes = profile.xpath('//address//p')
            if address_nodes:
                p.add_contact('address', address_nodes[0].text_content(), 'legislature')

            for row in profile.xpath('//table[@summary="Contact information"]//tr'):
                label = row.xpath('./th/text()')[0]
                value = row.xpath('./td//text()')[0]
                if 'Title' in label:
                    continue

                is_link = 'Website' in label or 'Facebook' in label or 'Twitter' in label
                if is_link:
                    # Links use the anchor text rather than the first text node.
                    p.add_link(row.xpath('./td/a/text()')[0])
                elif 'Telephone' in label:
                    p.add_contact('voice', value, 'legislature')
                elif 'Fax' in label:
                    p.add_contact('fax', value, 'legislature')
                elif 'Email' in label:
                    p.add_contact('email', value)
            yield p
Example #21
0
    def scrape(self):
        """Yield the mayor, then a Person for each occupied ward.

        Wards are the ``<th>`` cells on COUNCIL_PAGE whose text contains
        "Ward". The second child of each cell links to the councillor's
        profile, from which photo, address and contact rows are read.
        """
        yield self.scrape_mayor()

        council_page = self.lxmlize(COUNCIL_PAGE)
        for header in council_page.xpath('//th[contains(text(), "Ward")]'):
            ward_name = header.text
            person_link = header[1]
            person_name = person_link.text
            if person_name == 'Vacant':
                continue

            page_url = person_link.attrib['href']
            detail = self.lxmlize(page_url)

            p = Person(primary_org='legislature', name=person_name, district=ward_name, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(page_url)

            image_urls = detail.xpath('//div[@id="contentArea"]//img/@src')
            if image_urls:
                p.image = image_urls[0]

            address_paragraphs = detail.xpath('//address//p')
            if address_paragraphs:
                street_address = address_paragraphs[0].text_content()
                p.add_contact('address', street_address, 'legislature')

            contact_rows = detail.xpath('//table[@class="contactListing"]//tr')
            for contact_row in contact_rows:
                kind = contact_row.xpath('./th/text()')[0]
                text = contact_row.xpath('./td//text()')[0]
                if 'Title' in kind:
                    # Title rows carry no contact information.
                    continue
                if 'Website' in kind or 'Facebook' in kind or 'Twitter' in kind:
                    link_text = contact_row.xpath('./td/a/text()')[0]
                    p.add_link(link_text)
                elif 'Telephone' in kind:
                    p.add_contact('voice', text, 'legislature')
                elif 'Fax' in kind:
                    p.add_contact('fax', text, 'legislature')
                elif 'Email' in kind:
                    p.add_contact('email', text)
            yield p
Example #22
0
    def scrape(self):
        """Yield a Person per "Councillor"/"Mayor" entry of the sub-navigation.

        The first entry is taken to be the mayor of Ajax. Other entries get
        their ward and role from the ``<h1>`` of their profile page. Raises
        AssertionError when the navigation yields no entries.
        """
        index = self.lxmlize(COUNCIL_PAGE)

        entries = index.xpath('//ul[@class="subNav top"]/li/ul//li/a[contains(text(), "Councillor")] | //ul[@class="subNav top"]/li/ul//li/a[contains(text(), "Mayor")]')
        assert len(entries), 'No councillors found'
        first_entry = entries[0]
        for entry in entries:
            display_name = entry.text_content()

            url = entry.attrib['href']
            detail = self.lxmlize(url)

            if entry == first_entry:
                district, role = 'Ajax', 'Mayor'
            else:
                title = detail.xpath('//div[@id="printAreaContent"]//h1')[0].text_content()
                district = re.findall(r'Ward.*', title)[0].strip()
                role = re.search('((?:Regional )?Councillor)', title).group(1)

            # Strip the role out of the navigation text to leave the name.
            display_name = display_name.replace(role, '').strip()

            p = Person(primary_org='legislature', name=display_name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            p.image = detail.xpath('//div[@class="intQuicklinksPhoto"]//img/@src')[0]

            table_rows = detail.xpath('//table[@class="datatable"][1]//tr')[1:]
            for table_row in table_rows:
                columns = table_row.xpath('./td')
                kind = columns[0].text_content().strip()
                if re.match(r'(Home)|(Cell)|(Phone)|(Fax)|(Email)', kind):
                    value = columns[1].text_content().strip()
                    kind = CONTACT_DETAIL_TYPE_MAP[kind]
                    if kind == 'email':
                        p.add_contact(kind, value, '')
                    else:
                        p.add_contact(kind, value, 'legislature')
                elif kind == 'Address':
                    value = ''.join(table_row.xpath('./td[2]//text()')).strip()
                    p.add_contact(kind, value, 'legislature')
                else:
                    # Remaining rows are treated as links.
                    p.add_link(table_row.xpath('./td[2]/a/@href')[0])
            yield p
Example #23
0
    def scrape(self):
        """Yield a Person for every member linked from the council listing.

        The profile title decides the role: titles containing "Councillor"
        are split into district and name (regional councillors get a
        numbered "Vaughan (seat N)" district), anything else is the mayor.
        Phone, fax and email are parsed from a contact block whose location
        depends on the role.
        """
        next_regional_seat = 1

        listing = self.lxmlize(COUNCIL_PAGE)

        member_links = listing.xpath('//div[@id="WebPartWPQ3"]//ul[@class="dfwp-list"][1]/li/div/div/a')
        assert len(member_links), 'No councillors found'
        for member_link in member_links:
            url = member_link.attrib['href']
            profile = self.lxmlize(url)

            title = profile.xpath('//div[@class="PL_Title"]')[0].text_content()
            if "Councillor" not in title:
                keywords = profile.xpath('//meta[@name="keywords"]/@content')[0]
                name = re.search(r'Mayor ([^,]+)', keywords).group(1)
                district = 'Vaughan'
                role = 'Mayor'
            else:
                district, name = re.split(r'Councillor', title)
                if "Regional" in district:
                    role = 'Regional Councillor'
                    district = "Vaughan (seat {})".format(next_regional_seat)
                    next_regional_seat += 1
                else:
                    role = 'Councillor'
            name = name.strip()

            if role == 'Mayor':
                # The mayor's details live on a separate "Contact the Mayor" page.
                contact_url = profile.xpath('//a[contains(@href,"/Contact-the-Mayor")]/@href')[0]
                contact_page = self.lxmlize(contact_url)
                contact_info = contact_page.xpath('//div[@id="ctl00_PlaceHolderMain_RichHtmlField1__ControlWrapper_RichHtmlField"]')[0]
            else:
                # Prefer the WPQ2 web part when it mentions "Phone".
                candidates = profile.xpath('//div[@id="WebPartWPQ2"][contains(., "Phone")]')
                contact_info = candidates[0] if candidates else profile.xpath('//div[@id="WebPartWPQ3"]')[0]

            contact_text = contact_info.text_content()
            phone = re.findall(r'[0-9]{3}-[0-9]{3}-[0-9]{4} ext\. [0-9]{4}', contact_text)[0].replace('ext. ', 'x')
            # The second plain phone-number match is used as the fax.
            fax = re.findall(r'[0-9]{3}-[0-9]{3}-[0-9]{4}', contact_text)[1]
            email = self.get_email(contact_info)

            p = Person(primary_org='legislature', name=name, district=district.strip(), role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('fax', fax, 'legislature')
            p.add_contact('email', email)

            photos = profile.xpath('//img[contains(@alt, "Councillor")]/@src')
            if photos:
                p.image = photos[0]

            # First anchor per social network, in this fixed order.
            social_queries = (
                './/a[contains(@href,"facebook")]',
                './/a[contains(@href,"twitter")]',
                './/a[contains(@href,"youtube")]',
            )
            for query in social_queries:
                matches = profile.xpath(query)
                if matches:
                    p.add_link(matches[0].attrib['href'])
            yield p
Example #24
0
    def scrape(self):
        """Yield a Person for every team-member section except the last one.

        A section whose job title contains "Maire" becomes the mayor of
        Côte-Saint-Luc; otherwise the job title is split on its first comma
        into role and district, with "Conseillère" normalised to
        "Conseiller".
        """
        page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT)
        # The final matched section is deliberately left out.
        sections = page.xpath('//section[contains(@class,"avia-team-member")]')[:-1]
        assert len(sections), 'No councillors found'

        for section in sections:
            name = section.xpath('.//h3/text()')[0]

            mayor_title = section.xpath(
                './/div[contains(@class,"team-member-job-title")][contains(.,"Maire")]/text()'
            )
            if mayor_title:
                role, district = 'Maire', 'Côte-Saint-Luc'
            else:
                job_title = section.xpath(
                    './/div[contains(@class,"team-member-job-title")]/text()'
                )[0]
                role, district = job_title.split(',', 1)
                if role == 'Conseillère':
                    role = 'Conseiller'

            photo = section.xpath('.//img/@src')[0]
            # Candidate links, kept in the order they are added below.
            # Twitter uses the anchor text, the others the href.
            link_candidates = (
                section.xpath('.//p[contains(.,"Twitter")]/a/text()'),
                section.xpath('.//p[contains(.,"Web")]/a/@href'),
                section.xpath('.//p[contains(.,"Blog")]/a/@href'),
            )

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)

            p.add_contact('email', self.get_email(section))
            phone = self.get_phone(section, area_codes=[514])
            p.add_contact('voice', phone, 'legislature')
            p.image = photo
            for found in link_candidates:
                if found:
                    p.add_link(found[0])

            yield p
Example #25
0
    def scrape_people(self, rows, gender):
        """Yield a Person for each member tile in ``rows``.

        For every tile the member's profile page is fetched and used to
        collect the email, photo, personal link, preferred languages, and
        the Hill Office and constituency office contact details.

        :param rows: elements containing the ``ce-mip-mp-*`` tile markup.
        :param gender: value stored on each yielded person's ``gender``.
        :raises AssertionError: when ``rows`` is empty.
        """
        def parse_phone_and_fax(element):
            """Return ``(voice, fax)`` parsed from a telephone/fax paragraph.

            Phone and fax share one element, one per line:
            <p>
              Telephone: xxx-xxx-xxxx<br/>
              Fax: xxx-xxx-xxx
            </p>
            A missing value comes back as an empty string, so a fax number is
            never carried over from a previously parsed office.
            """
            lines = element.text_content().strip().splitlines()
            voice = fax = ''
            if lines:
                # Some pages have an empty value - "Telephone:" - so the
                # label is removed without assuming a trailing space, e.g.
                # https://www.ourcommons.ca/Members/en/michael-barrett(102275)#contact
                voice = lines[0].replace('Telephone:', '').replace(
                    'Téléphone :', '').strip()
            if len(lines) > 1:
                fax = lines[1].replace('Fax:', '').replace(
                    'Télécopieur :', '').strip()
            return voice, fax

        assert len(rows), 'No members found'
        for row in rows:
            name = row.xpath(
                './/div[@class="ce-mip-mp-name"][1]')[0].text_content()
            constituency = row.xpath(
                './/div[@class="ce-mip-mp-constituency"][1]')[0].text_content(
                )
            constituency = constituency.replace('–', '—')  # n-dash, m-dash
            if constituency == 'Mont-Royal':
                constituency = 'Mount Royal'

            province = row.xpath(
                './/div[@class="ce-mip-mp-province"][1]')[0].text_content()

            party = row.xpath(
                './/div[@class="ce-mip-mp-party"][1]')[0].text_content()

            url = row.xpath('.//a[@class="ce-mip-mp-tile"]/@href')[0]

            # Québec members are scraped from the French version of the page.
            if province == 'Québec':
                url = url.replace('/en/', '/fr/')

            mp_page = self.lxmlize(url)
            email = self.get_email(mp_page,
                                   '//*[@id="contact"]/div/p/a',
                                   error=False)

            photo = mp_page.xpath(
                './/div[@class="ce-mip-mp-profile-container"]//img/@src')[0]

            m = Person(primary_org='lower',
                       name=name,
                       district=constituency,
                       role='MP',
                       party=party)
            m.add_source(COUNCIL_PAGE)
            m.add_source(url)
            m.gender = gender
            # Not every member lists an email.
            # @see https://www.ourcommons.ca/Members/en/ziad-aboultaif(89156)
            if email:
                m.add_contact('email', email)

            if photo:
                # Determine whether the photo is actually a generic silhouette
                photo_response = self.get(photo)
                if (photo_response.status_code == 200
                        and hashlib.sha1(photo_response.content).hexdigest()
                        not in IMAGE_PLACEHOLDER_SHA1):
                    m.image = photo

            # The new parliament website may no longer list personal websites.
            personal_url = mp_page.xpath(
                './/a[contains(@title, "Personal Web Site")]/@href')
            if personal_url:
                m.add_link(personal_url[0])

            preferred_languages = mp_page.xpath(
                './/dt[contains(., "Preferred Language")]/following-sibling::dd/text()'
            )
            if preferred_languages:
                m.extras['preferred_languages'] = [
                    language.replace('/', '').strip()
                    for language in preferred_languages
                ]

            if province == 'Québec':
                m.add_contact('address',
                              'Chambre des communes\nOttawa ON  K1A 0A6',
                              'legislature')
            else:
                m.add_contact('address',
                              'House of Commons\nOttawa ON  K1A 0A6',
                              'legislature')

            # Hill Office contacts
            phone_and_fax_el = mp_page.xpath(
                './/h4[contains(., "Hill Office")]/../p[contains(., "Telephone")]|.//h4[contains(., "Hill Office")]/../p[contains(., "Téléphone :")]'
            )
            if len(phone_and_fax_el):
                voice, fax = parse_phone_and_fax(phone_and_fax_el[0])
                if voice:
                    m.add_contact('voice', voice, 'legislature')

                if fax:
                    m.add_contact('fax', fax, 'legislature')

            # Constituency Office contacts
            # Some people have more than one, e.g. https://www.ourcommons.ca/Members/en/ben-lobb(35600)#contact
            for i, constituency_office_el in enumerate(
                    mp_page.xpath(
                        './/div[@class="ce-mip-contact-constituency-office-container"]/div'
                    )):
                # Additional offices get a numbered note: "constituency (2)", ...
                note = 'constituency'
                if i:
                    note += ' ({})'.format(i + 1)

                address = constituency_office_el.xpath('./p[1]')[0]
                address = address.text_content().strip().splitlines()
                address = list(map(str.strip, address))
                m.add_contact('address', '\n'.join(address), note)

                phone_and_fax_el = constituency_office_el.xpath(
                    './p[contains(., "Telephone")]|./p[contains(., "Téléphone")]'
                )
                if len(phone_and_fax_el):
                    # Both values are freshly parsed for this office; an
                    # office without a fax line must not reuse an earlier fax.
                    voice, fax = parse_phone_and_fax(phone_and_fax_el[0])

                    if voice:
                        m.add_contact('voice', voice, note)

                    if fax:
                        m.add_contact('fax', fax, note)

            yield m
Example #26
0
    def scrape(self):
        """Yield an Organization per district and a Person per listed member.

        The first four links of the index page are walked; every district
        page linked from them yields its organization first, then one person
        per non-vacant name. The first name on a district page is given the
        "Mayor" role, all later ones "Councillor".
        """
        index_page = self.lxmlize(COUNCIL_PAGE)
        type_links = index_page.xpath(
            '//div[@class="bluearrow shaded bottomborder "][1]/ul/li/a/@href'
        )[:4]
        for type_index, link in enumerate(type_links):
            type_page = self.lxmlize(link)
            district_urls = type_page.xpath(
                '//div[@class="parbase list section cplist"]/table/tr/td[1]/b/a/@href'
            )
            for district_url in district_urls:
                detail = self.lxmlize(district_url)
                heading = detail.xpath('//div[@class="pageHeader"]/h1/text()')[0]
                # The district name is the part after the first " - ".
                district = heading.split(' - ')[1].strip()

                org = Organization(
                    name=district + org_types[type_index],
                    classification='legislature',
                    jurisdiction_id=self.jurisdiction.jurisdiction_id)
                org.add_source(district_url)
                yield org

                address_parts = detail.xpath(
                    '//div[@class="left_contents"]/p[1]/text()')
                address = ', '.join(address_parts)
                contact_lines = detail.xpath(
                    '//div[@class="left_contents"]/p[b[text() = "Contact"]]/text()'
                )
                # Each line is "<label>: <number>"; spaces inside the number
                # become hyphens.
                phone = contact_lines[0].split(':')[1].strip().replace(' ', '-')
                fax = contact_lines[1].split(':')[1].strip().replace(' ', '-')
                email = self.get_email(detail, '//div[@class="left_contents"]')

                site_anchors = detail.xpath(
                    '//div[@class="left_contents"]//a[not(contains(@href,"mailto:"))]'
                )
                site = site_anchors[0].text_content() if site_anchors else ''

                member_names = detail.xpath(
                    '//div[@class="right_contents"]//p/text()')
                for position, member_name in enumerate(member_names):
                    if 'Vacant' in member_name:
                        continue
                    p = Person(primary_org='legislature',
                               name=member_name,
                               district=district)
                    for source in (COUNCIL_PAGE, link, district_url):
                        p.add_source(source)

                    role = 'Mayor' if position == 0 else 'Councillor'
                    membership = p.add_membership(org, role=role)

                    membership.post_id = district
                    membership.add_contact_detail('address', address,
                                                  'legislature')
                    if phone:
                        membership.add_contact_detail('voice', phone,
                                                      'legislature')
                    if fax:
                        membership.add_contact_detail('fax', fax,
                                                      'legislature')
                    if email:
                        membership.add_contact_detail('email', email)
                    if site:
                        p.add_link(site)
                    yield p
Example #27
0
    def scrape(self):
        """Yield a Person for every non-vacant row of the MLA table.

        Each member's profile page supplies the photo, an optional website
        link, and two text blocks (legislature and constituency) from which
        an address and a phone number are extracted.

        :raises AssertionError: when no member rows are found, or when a
            contact block does not start with its expected heading.
        """
        def handle_address(person, lines, address_type):
            """Add the lines preceding the first "Label:" line as an address."""
            address_lines = []
            for line in lines:
                if line.endswith(':'):  # Room:, Phone:, Fax:
                    break
                address_lines.append(line)
            if address_lines:
                person.add_contact(
                    'address',
                    ' '.join(address_lines),
                    address_type,
                )

        def handle_phone(person, lines, phone_type):
            """Add the value that follows the "Phone:" label, if one exists."""
            if 'Phone:' not in lines:
                return
            value_index = lines.index('Phone:') + 1
            if value_index >= len(lines):
                # "Phone:" is the last text node, so there is no value.
                return
            next_line = lines[value_index]
            if next_line.endswith(':'):
                # The next node is another label: the phone value is empty.
                return
            if '/' in next_line:
                # Several numbers on one line: keep the first "306-" one.
                fragments = (part.strip() for part in next_line.split('/'))
                number = next(
                    (part for part in fragments if part.startswith('306-')),
                    None,
                )
            else:
                number = next_line
            # Nothing usable was found; never hand None to add_contact.
            if number is None:
                return
            person.add_contact('voice',
                               number,
                               phone_type,
                               area_code=306)

        page = self.lxmlize(COUNCIL_PAGE)

        # The first table row is skipped.
        members = page.xpath('//table[@id="MLAs"]//tr')[1:]
        assert len(members), 'No members found'
        for member in members:
            if 'Vacant' not in member.xpath('./td')[0].text_content():
                # Drop everything up to the first ". " in the name cell.
                name = member.xpath('./td')[0].text_content().split('. ', 1)[1]
                party = member.xpath('./td')[1].text
                district = member.xpath('./td')[2].text_content()
                url = member.xpath('./td[1]/a/@href')[0]
                page = self.lxmlize(url)

                p = Person(primary_org='legislature',
                           name=name,
                           district=district,
                           role='MLA',
                           party=party)
                p.add_source(COUNCIL_PAGE)
                p.add_source(url)
                p.image = page.xpath(
                    '//div[contains(@class, "mla-image-cell")]/img/@src')[0]

                contact = page.xpath('//div[@id="mla-contact"]/div[2]')[0]
                website = contact.xpath('./div[3]/div[3]/div[2]/a')
                if website:
                    p.add_link(website[0].text_content())

                legislature_lines = contact.xpath(
                    './/div[@class="col-md-4"][1]/div//text()')
                assert (legislature_lines[0] == 'Legislative Building Address')
                handle_address(p, legislature_lines[1:], 'legislature')
                handle_phone(p, legislature_lines[1:], 'legislature')

                constituency_lines = contact.xpath(
                    './/div[@class="col-md-4"][2]/div//text()')
                assert (constituency_lines[0] == 'Constituency Address')
                handle_address(p, constituency_lines[1:], 'constituency')
                handle_phone(p, constituency_lines[1:], 'constituency')

                email = self.get_email(contact, error=False)
                if email:
                    p.add_contact('email', email)

                yield p
Example #28
0
    def scrape(self):
        """Yield a Person per candidate row of the COUNCIL_PAGE CSV, then the councils.

        Each row is tied to a census subdivision through its ``district id``
        column when present, otherwise through a lookup of its
        ``district name``.  One Organization per division is created the
        first time the division is encountered; the organizations are yielded
        after every person has been yielded.

        Raises:
            Exception: when a district name is shared by several divisions
                ('unhandled collision') or a row carries a role outside
                ``expected_roles`` ('unexpected role').
        """
        exclude_divisions = {}
        exclude_districts = {
            'Capital',
            'Capital F',
            'Capital G',
            'Capital H',
            'Central Coast B',
            'Central Okanagan East',
            'Central Okanagan West',
            'Comox Valley B',
            'Comox Valley C',
            'Islands Trust',
            'Kitimat-Stikine C',
            'Kootenay Boundary B',
            'Kootenay Boundary C',
            'Kootenay Boundary D',
            'Kootenay Boundary E',
            'Metro Vancouver A',
            'North Coast A',
            'North Coast C',
            'North Coast D',
            'North Coast E',
            'Okanagan-Similkameen I',
            'Okanagan-Similkameen Olalla Local Community Commission',
            'Qathet A',
            'Qathet B',
            'Qathet C',
            'Qathet D',
            'Qathet E',
        }
        expected_roles = {
            'candidate',
        }
        infixes = {
            'CY': 'City',
            'DM': 'District',
            'IGD': 'District',
            'IM': 'Municipal',
            'RGM': 'Regional',
            'T': 'Town',
            'VL': 'Village',
            'RDA': 'District',
        }
        duplicate_names = {
            'Rick Smith',
            'Sung Y Wong',
            'Elizabeth Taylor',
        }

        # Map division names to IDs; a name seen twice is stored as None so
        # that a later lookup can report the ambiguity.
        names_to_ids = {}
        country = Division.get('ocd-division/country:ca')
        for csd in country.children('csd'):
            csd_code = csd.id.rsplit(':', 1)[1]
            if not csd_code.startswith('59'):
                continue
            if csd.attrs['classification'] == 'IRI':
                continue
            if csd.name in names_to_ids:
                names_to_ids[csd.name] = None
            else:
                names_to_ids[csd.name] = csd.id

        reader = self.csv_reader(COUNCIL_PAGE, header=True)
        reader.fieldnames = [field.lower() for field in reader.fieldnames]

        # Keyed by division ID; doubles as the record of divisions already seen.
        organizations = {}

        # Synthetic, increasing birth years tell same-named people apart.
        next_birth_year = 1900

        for row in reader:
            name = row['full name']
            district_name = row['district name']

            is_blank_row = not any(row.values())
            if is_blank_row or name.lower() in ('', 'vacant') or district_name in exclude_districts:
                continue

            district_code = row['district id']
            if district_code:
                division_id = 'ocd-division/country:ca/csd:{}'.format(district_code)
            else:
                division_id = names_to_ids[district_name]

            if division_id in exclude_divisions:
                continue
            if not division_id:
                raise Exception('unhandled collision: {}'.format(district_name))

            division = Division.get(division_id)
            division_name = division.name
            infix = infixes[division.attrs['classification']]
            organization_name = '{} {} Council'.format(division_name, infix)

            if division_id not in organizations:
                council = Organization(name=organization_name, classification='government')
                council.add_source(COUNCIL_PAGE)
                organizations[division_id] = council
            organization = organizations[division_id]

            role = row['primary role']
            if role not in expected_roles:
                raise Exception('unexpected role: {}'.format(role))

            # Rows that name their division explicitly are labelled by its ID.
            district = format(division_id) if district_code else division_name

            organization.add_post(role=role, label=district, division_id=division_id)

            p = Person(
                primary_org='government',
                primary_org_name=organization_name,
                name=name,
                district=district,
                role=role,
            )
            p.add_source(COUNCIL_PAGE)
            source_url = row['source url']
            if source_url:
                p.add_source(source_url)

            if name in duplicate_names:
                p.birth_date = str(next_birth_year)
                next_birth_year += 1

            email = row['email']
            if email:
                p.add_contact('email', email)

            phone = row['phone']
            if phone:
                p.add_contact('voice', phone, 'legislature')

            twitter = row['twitter']
            if twitter:
                p.add_link(twitter)

            boundary_code = division_id.rsplit(':', 1)[1]
            p._related[0].extras['boundary_url'] = '/boundaries/census-subdivisions/{}/'.format(boundary_code)

            yield p

        for council in organizations.values():
            yield council
Example #29
0
    def scrape(self):
        """Yield one Person (role 'MNA') per row of the members table on COUNCIL_PAGE.

        For every member the detail page (photo) and the matching
        ``coordonnees.html`` page (phone numbers) are fetched.  Each address
        block that yields a phone number must have a known heading.

        Raises:
            KeyError: when an address block with a phone number carries a
                heading that is not in the lookup table below.
        """
        # Address-block heading -> contact note.
        note_by_heading = {
            'Circonscription': 'constituency',
            'Parlement': 'legislature',
            'Ministère': 'legislature',
        }

        page = self.lxmlize(COUNCIL_PAGE)
        members = page.xpath('//*[@id="ListeDeputes"]/tbody/tr')

        assert len(members), 'No members found'
        for row in members:
            name_comma, division = [cell.text_content() for cell in row[:2]]

            # "Last, First" -> " First Last" pieces joined in reverse order.
            name_parts = name_comma.strip().split(',')
            name = ' '.join(reversed(name_parts))

            division = division.replace('–', '-')  # n-dash, hyphen

            party = row[2].text_content().strip()
            if party == 'Indépendante':
                party = 'Indépendant'

            email = self.get_email(row[3], error=False)

            detail_url = row[0][0].attrib['href']
            detail_page = self.lxmlize(detail_url)

            contact_url = detail_url.replace('index.html', 'coordonnees.html')
            contact_page = self.lxmlize(contact_url)

            photo_url = detail_page.xpath('//img[@class="photoDepute"]/@src')

            p = Person(
                primary_org='legislature',
                name=name,
                district=division,
                role='MNA',
                party=party,
            )
            p.add_source(COUNCIL_PAGE)
            p.add_source(detail_url)

            if photo_url:
                p.image = photo_url[0]
            if email:
                p.add_contact('email', email)

            identifier = re.search(r'/([^/]+)/index.html', detail_url).group(1)
            facebook, twitter = SOCIAL_MEDIA_DATA.get(identifier, ('', ''))
            for link in (facebook, twitter):
                if link:
                    p.add_link(link)

            for block in contact_page.xpath('//div[@class="blockAdresseDepute"]'):
                try:
                    phone = self.get_phone(block)
                    heading = block.find('h3').text
                except Exception:
                    continue  # probably just no phone number present

                # An unknown heading raises KeyError on purpose: the scraper
                # should be updated to handle the new value.
                p.add_contact('voice', phone, note_by_heading[heading])
            yield p
Example #30
0
    def scrape(self):
        """Yield one Person (role 'MP') per data row of the members table on COUNCIL_PAGE.

        Each member's own page is fetched (the French version when the
        province is 'Québec') to collect e-mail, photo, personal web site,
        preferred languages, and the legislature and constituency office
        contact details.
        """
        page = self.lxmlize(COUNCIL_PAGE)
        rows = page.xpath('//div[@class="content-primary"]//tr')[1:]
        assert len(rows), 'No members found'
        for row in rows:
            name_cell = row.xpath('./td[1]')[0]
            last_name = name_cell.xpath('.//span[1]//text()')[0]
            first_name = name_cell.xpath('.//span[2]//text()')[0]
            name = '{} {}'.format(first_name, last_name)

            constituency = row.xpath('./td[2]//text()')[0]
            constituency = constituency.replace('–', '—')  # n-dash, m-dash
            if constituency == 'Mont-Royal':
                constituency = 'Mount Royal'

            province = row.xpath('./td[3]//text()')[0]
            party = row.xpath('string(./td[4])')  # allow string()
            in_quebec = province == 'Québec'

            url = name_cell.xpath('.//a/@href')[0]
            if in_quebec:
                url = url.replace('/en/', '/fr/')

            mp_page = self.lxmlize(url)
            email = self.get_email(mp_page, '//span[@class="caucus"]', error=False)
            photo = mp_page.xpath('//div[@class="profile overview header"]//img/@src')[0]

            member = Person(primary_org='lower', name=name, district=constituency, role='MP', party=party)
            member.add_source(COUNCIL_PAGE)
            member.add_source(url)
            # @see http://www.parl.gc.ca/Parliamentarians/en/members/David-Yurdiga%2886260%29
            if email:
                member.add_contact('email', email)
            elif name == 'Adam Vaughan':
                member.add_contact('email', '*****@*****.**')

            if photo:
                # Only keep the photo when it downloads successfully and is
                # not one of the known generic silhouettes.
                photo_response = self.get(photo)
                if photo_response.status_code == 200:
                    digest = hashlib.sha1(photo_response.content).hexdigest()
                    if digest not in IMAGE_PLACEHOLDER_SHA1:
                        member.image = photo

            personal_url = mp_page.xpath('//a[contains(@title, "Personal Web Site")]/@href')
            if personal_url:
                member.add_link(personal_url[0])

            preferred_languages = mp_page.xpath('//span[@class="label"][contains(., "Preferred Language")]/following-sibling::span[@class="constituency"]/text()')
            if preferred_languages:
                member.extras['preferred_languages'] = [
                    language.replace('/', '').strip() for language in preferred_languages
                ]

            if in_quebec:
                hill_address = 'Chambre des communes\nOttawa ON  K1A 0A6'
            else:
                hill_address = 'House of Commons\nOttawa ON  K1A 0A6'
            member.add_contact('address', hill_address, 'legislature')

            hill_voice = mp_page.xpath('//div[@class="hilloffice"]//span//text()[contains(., "Telephone:")]|//div[@class="hilloffice"]//span//text()[contains(., "Téléphone :")]')[0]
            hill_voice = hill_voice.replace('Telephone: ', '').replace('Téléphone : ', '')
            if hill_voice:
                member.add_contact('voice', hill_voice, 'legislature')

            hill_fax = mp_page.xpath('//div[@class="hilloffice"]//span//text()[contains(., "Fax:")]|//div[@class="hilloffice"]//span//text()[contains(., "Télécopieur :")]')[0]
            hill_fax = hill_fax.replace('Fax: ', '').replace('Télécopieur : ', '')
            if hill_fax:
                member.add_contact('fax', hill_fax, 'legislature')

            offices = mp_page.xpath('//div[@class="constituencyoffices"]//li')
            for index, office in enumerate(offices):
                spans = office.xpath('./span[not(@class="spacer")]')

                # The first office is plain 'constituency'; later ones are numbered from 2.
                note = 'constituency'
                if index:
                    note += ' ({})'.format(index + 1)

                # Address line 1, address line 2, "city, region", postal code.
                address_lines = [spans[position].text_content() for position in range(4)]
                member.add_contact('address', '\n'.join(address_lines), note)

                office_voice = office.xpath('./span//text()[contains(., "Telephone:")]|./span//text()[contains(., "Téléphone :")]')
                if office_voice:
                    office_voice = office_voice[0].replace('Telephone: ', '').replace('Téléphone : ', '')
                    if office_voice:
                        member.add_contact('voice', office_voice, note)

                office_fax = office.xpath('./span//text()[contains(., "Fax:")]|./span//text()[contains(., "Télécopieur :")]')
                if office_fax:
                    office_fax = office_fax[0].replace('Fax: ', '').replace('Télécopieur : ', '')
                    if office_fax:
                        member.add_contact('fax', office_fax, note)

            yield member
Example #31
0
    def scrape(self):
        """Yield the council members and council Organization of every listed municipality.

        COUNCIL_PAGE links to one list page per municipality classification;
        each list page links to one detail page per municipality.  For every
        detail page the people found in the right-hand column are yielded,
        followed by the municipality's Organization.

        Raises:
            Exception: on a duplicate division name or division ID
                ('unhandled collision') or on a role heading outside
                ``expected_roles`` ('unexpected role').
            KeyError: when a municipality name cannot be matched to a division
                or a list link text is not a known classification.
        """
        exclude_divisions = {
            'ocd-division/country:ca/csd:1301006',  # Saint John
            'ocd-division/country:ca/csd:1307022',  # Moncton
            'ocd-division/country:ca/csd:1310032',  # Fredericton
        }
        expected_roles = {
            'Mayor',
            'Councillor',
        }
        unique_roles = {
            'Mayor',
        }
        classifications = {
            'Cities': 'City',
            'Towns': 'Town',
            'Villages': 'Village',
            'Rural Communities': 'Community',
            'Regional Municipality': 'Regional',
        }
        corrections = {
            'Beaubassin-est/East': 'Beaubassin East',
            'Lac-Baker': 'Lac Baker',
            'Saint-François-de-Madawaska': 'Saint-François de Madawaska',
            'Saint-Hilaire': 'Saint Hilaire',
        }
        unknown_names = {
            'Haut-Madawaska',  # incorporated after Census 2016
        }
        duplicate_names = {
            'Denis Savoie',
            'Josée Levesque',
            'Luc Levesque',
        }

        names_to_ids = {}
        for division in Division.get('ocd-division/country:ca').children('csd'):
            type_id = division.id.rsplit(':', 1)[1]
            if type_id.startswith('13'):
                if division.attrs['classification'] == 'P':
                    continue
                if division.name in names_to_ids:
                    raise Exception('unhandled collision: {}'.format(division.name))
                names_to_ids[division.name] = division.id

        index_page = self.lxmlize(COUNCIL_PAGE)
        list_links = index_page.xpath('//div[@id="sidebar"]//div[contains(@class, "list")][1]//a')

        # Synthetic, increasing birth years tell same-named people apart.
        birth_date = 1900
        seen = set()

        assert len(list_links), 'No list items found'
        for list_link in list_links:
            list_url = list_link.attrib['href']
            list_page = self.lxmlize(list_url)
            detail_urls = list_page.xpath('//td[1]//@href')

            assert len(detail_urls), 'No municipalities found'
            for detail_url in detail_urls:
                detail_page = self.lxmlize(detail_url, encoding='utf-8')
                heading = detail_page.xpath('//h1/text()')[0]
                # Keep the part after " - " and expand a leading "St"/"St." to "Saint".
                division_name = re.sub(r'\ASt\b\.?', 'Saint', heading.split(' - ', 1)[1])
                division_name = corrections.get(division_name, division_name)

                if division_name in unknown_names:
                    continue
                division_id = names_to_ids[division_name]
                if division_id in exclude_divisions:
                    continue
                if division_id in seen:
                    raise Exception('unhandled collision: {}'.format(division_id))

                seen.add(division_id)
                division_name = Division.get(division_id).name
                organization_name = '{} {} Council'.format(division_name, classifications[list_link.text])
                organization = Organization(name=organization_name, classification='government')
                organization.add_source(detail_url)

                address = ', '.join(detail_page.xpath('//div[@class="left_contents"]/p[1]/text()'))

                contacts = detail_page.xpath('//div[@class="left_contents"]/p[contains(., "Contact")]/text()')
                phone = contacts[0].split(':')[1]
                # The fax line is optional.  Reset the value for every
                # municipality so that a missing line neither leaves the name
                # unbound nor reuses the previous municipality's number.
                fax = contacts[1].split(':')[1] if len(contacts) > 1 else None
                email = self.get_email(detail_page, '//div[@class="left_contents"]', error=False)

                url = detail_page.xpath('//div[@class="left_contents"]//@href[not(contains(., "mailto:"))]')
                if url:
                    url = url[0]

                groups = detail_page.xpath('//div[contains(@class, "right_contents")]/p')
                assert len(groups), 'No groups found'
                for group in groups:
                    # Headings are plural ("Councillors"); posts use the singular.
                    role = group.xpath('./b/text()')[0].rstrip('s')
                    if role not in expected_roles:
                        raise Exception('unexpected role: {}'.format(role))

                    councillors = group.xpath('./text()')
                    assert len(councillors), 'No councillors found'
                    for seat_number, name in enumerate(councillors, 1):
                        if 'vacant' in name.lower():
                            continue

                        if role in unique_roles:
                            district = division_name
                        else:
                            district = '{} (seat {})'.format(division_name, seat_number)

                        organization.add_post(role=role, label=district, division_id=division_id)

                        person = Person(
                            primary_org='government',
                            primary_org_name=organization_name,
                            name=name,
                            district=district,
                            role=role,
                        )
                        person.add_source(COUNCIL_PAGE)
                        person.add_source(list_url)
                        person.add_source(detail_url)

                        if name in duplicate_names:
                            person.birth_date = str(birth_date)
                            birth_date += 1

                        person.add_contact('address', address, 'legislature')
                        # @see https://en.wikipedia.org/wiki/Area_code_506
                        if phone:
                            person.add_contact('voice', phone, 'legislature', area_code=506)
                        if fax:
                            person.add_contact('fax', fax, 'legislature', area_code=506)
                        if email:
                            person.add_contact('email', email)
                        if url:
                            person.add_link(url)

                        person._related[0].extras['boundary_url'] = '/boundaries/census-subdivisions/{}/'.format(
                            division_id.rsplit(':', 1)[1])

                        yield person

                yield organization
Example #32
0
    def scrape(self):
        """Yield a Person for every candidate in the COUNCIL_PAGE CSV, followed by the councils.

        A row's census subdivision comes from its ``district id`` column, or
        from a name lookup on ``district name`` when that column is empty.
        Organizations are created once per division and yielded only after
        all people.

        Raises:
            Exception: if a district name is ambiguous ('unhandled
                collision') or a role is not in ``expected_roles``
                ('unexpected role').
        """
        exclude_divisions = {
        }
        exclude_districts = {
            'Capital',
            'Capital F',
            'Capital G',
            'Capital H',
            'Central Coast B',
            'Central Okanagan East',
            'Central Okanagan West',
            'Comox Valley B',
            'Comox Valley C',
            'Islands Trust',
            'Kitimat-Stikine C',
            'Kootenay Boundary B',
            'Kootenay Boundary C',
            'Kootenay Boundary D',
            'Kootenay Boundary E',
            'Metro Vancouver A',
            'North Coast A',
            'North Coast C',
            'North Coast D',
            'North Coast E',
            'Okanagan-Similkameen I',
            'Okanagan-Similkameen Olalla Local Community Commission',
            'Qathet A',
            'Qathet B',
            'Qathet C',
            'Qathet D',
            'Qathet E',
        }
        expected_roles = {
            'candidate',
        }
        infixes = {
            'CY': 'City',
            'DM': 'District',
            'IGD': 'District',
            'IM': 'Municipal',
            'RGM': 'Regional',
            'T': 'Town',
            'VL': 'Village',
            'RDA': 'District',
        }
        duplicate_names = {
            'Rick Smith',
            'Sung Y Wong',
            'Elizabeth Taylor',
        }

        # Division name -> ID, with None marking a name shared by several
        # divisions so the ambiguity can be reported at lookup time.
        names_to_ids = {}
        for candidate_division in Division.get('ocd-division/country:ca').children('csd'):
            code = candidate_division.id.rsplit(':', 1)[1]
            wanted = code.startswith('59') and candidate_division.attrs['classification'] != 'IRI'
            if not wanted:
                continue
            already_known = candidate_division.name in names_to_ids
            names_to_ids[candidate_division.name] = None if already_known else candidate_division.id

        reader = self.csv_reader(COUNCIL_PAGE, header=True)
        reader.fieldnames = [field.lower() for field in reader.fieldnames]

        # Keyed by division ID; also records which divisions were already seen.
        organizations = {}

        # Synthetic, increasing birth years tell same-named people apart.
        birth_year = 1900

        rows = list(reader)
        assert len(rows), 'No councillors found'
        for row in rows:
            name = row['full name']
            district_name = row['district name']

            if not any(row.values()):
                continue
            if name.lower() in ('', 'vacant'):
                continue
            if district_name in exclude_districts:
                continue

            explicit_id = row['district id']
            if explicit_id:
                division_id = 'ocd-division/country:ca/csd:{}'.format(explicit_id)
            else:
                division_id = names_to_ids[district_name]

            if division_id in exclude_divisions:
                continue
            if not division_id:
                raise Exception('unhandled collision: {}'.format(district_name))

            division = Division.get(division_id)
            division_name = division.name

            organization_name = '{} {} Council'.format(division_name, infixes[division.attrs['classification']])

            if division_id not in organizations:
                new_council = Organization(name=organization_name, classification='government')
                new_council.add_source(COUNCIL_PAGE)
                organizations[division_id] = new_council
            organization = organizations[division_id]

            role = row['primary role']
            if role not in expected_roles:
                raise Exception('unexpected role: {}'.format(role))

            # Rows that name their division explicitly are labelled by its ID.
            if explicit_id:
                district = format(division_id)
            else:
                district = division_name

            organization.add_post(role=role, label=district, division_id=division_id)

            p = Person(
                primary_org='government',
                primary_org_name=organization_name,
                name=name,
                district=district,
                role=role,
            )
            p.add_source(COUNCIL_PAGE)
            if row['source url']:
                p.add_source(row['source url'])

            if name in duplicate_names:
                p.birth_date = str(birth_year)
                birth_year += 1

            if row['email']:
                p.add_contact('email', row['email'])
            if row['phone']:
                p.add_contact('voice', row['phone'], 'legislature')
            if row['twitter']:
                p.add_link(row['twitter'])

            csd_code = division_id.rsplit(':', 1)[1]
            p._related[0].extras['boundary_url'] = '/boundaries/census-subdivisions/{}/'.format(csd_code)

            yield p

        for council in organizations.values():
            yield council
Example #33
0
    def scrape(self):
        """Yield an Organization and its members for each municipality in the COUNCIL_PAGE PDF.

        The PDF is downloaded, converted to text with ``pdftotext -layout``
        and split into blank-line-separated chunks.  Chunks that mention
        'Councillors' are parsed as one municipality: the first column holds
        the name and address, the second column the mayor and councillors.

        The download is written in binary mode to a private temporary file,
        which is removed as soon as the conversion finishes (even on error),
        rather than to a fixed path that is only cleaned up if the generator
        runs to completion.
        """
        import tempfile

        response = urlopen(COUNCIL_PAGE).read()
        fd, pdf_path = tempfile.mkstemp(suffix='.pdf')
        try:
            # The payload is raw PDF bytes, so the handle must be binary.
            with os.fdopen(fd, 'wb') as pdf:
                pdf.write(response)
            data = subprocess.check_output(
                ['pdftotext', '-layout', pdf_path, '-'])
        finally:
            os.remove(pdf_path)

        # check_output returns bytes on Python 3; the str patterns below need
        # text.  Where the output is already str it is left untouched.
        if not isinstance(data, str):
            data = data.decode('utf-8')

        data = re.split(r'\n\s*\n', data)
        for municipality in data:

            if 'Councillors' not in municipality:
                continue
            lines = municipality.split('\n')
            # Drop a page header line and the blank line that may follow it.
            if 'Page' in lines[0]:
                lines.pop(0)
                if not lines[0].strip():
                    lines.pop(0)
            # Column boundaries are derived from the first line's layout.
            col1end = re.search(r'\s{2,}(\w)', lines[0].strip()).end()
            col2end = re.search(r':\s{2,}(\w)', lines[0].strip()).end()

            if 'Council' in lines[1]:
                # The municipality name spans two lines.
                address = lines[2][:col1end - 1].strip() + ' ' + lines[3][:col1end - 1].strip()
                district = lines[0][:col1end - 1].strip() + ' ' + lines[1][:col1end - 1].strip()
            else:
                address = lines[1][:col1end - 1].strip() + ' ' + lines[2][:col1end - 1].strip()
                district = lines[0][:col1end - 1].strip()

            organization = Organization(
                name=district + ' Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            organization.add_source(COUNCIL_PAGE)
            yield organization

            phone = re.findall(r'(?<=Phone: )\(?(\d{3}[\)-] ?\d{3}-\d{4})',
                               municipality)[0].replace(') ', '-')
            email = re.findall(r'(?<=E-mail:) (\S*)', municipality)[0]
            fax = None
            if 'Fax' in municipality:
                fax = re.findall(r'(?<=Fax: )\(?(\d{3}[\)-] ?\d{3}-\d{4})',
                                 municipality)[0].replace(') ', '-')
            website = None
            if 'Website' in municipality:
                website = re.findall(r'((http:\/\/|www.)(\S*))',
                                     municipality)[0][0]

            # Names are only read once a 'Mayor:' or 'Councillors' marker
            # line has been passed; the marker also sets the current role.
            councillor_or_mayor = False
            for line in lines:
                if 'Mayor:' in line:
                    councillor_or_mayor = True
                    role = 'Mayor'
                    continue
                if 'Councillors' in line:
                    councillor_or_mayor = True
                    role = 'Councillor'
                    continue
                if councillor_or_mayor:
                    councillor = line[col1end - 1:col2end - 1].strip()
                    if not councillor:
                        continue
                    p = Person(primary_org='legislature',
                               name=councillor,
                               district=district)
                    p.add_source(COUNCIL_PAGE)
                    membership = p.add_membership(organization,
                                                  role=role,
                                                  district=district)
                    membership.add_contact_detail('address', address, 'legislature')
                    membership.add_contact_detail('voice', phone, 'legislature')
                    membership.add_contact_detail('email', email)
                    if fax:
                        membership.add_contact_detail('fax', fax, 'legislature')
                    if website:
                        p.add_link(website)
                    yield p
Example #34
0
    def scrape(self):
        """Yield Organizations, Posts and People built from the rows of ``self.csv_url``.

        For each valid row (per ``self.is_valid_row``) the configured
        corrections are applied, the row's organization is yielded the first
        time it is seen, a Post is yielded, and finally a validated Person.
        Any exception raised while handling a row is printed and the row is
        skipped.
        """
        organizations = {}
        seat_numbers = defaultdict(lambda: defaultdict(int))

        # Optional CSV column -> contact type, added with the 'legislature' note.
        optional_contacts = (
            ('phone', 'voice'),
            ('fax', 'fax'),
            ('cell', 'cell'),
        )

        reader = self.csv_reader(
            self.csv_url,
            delimiter=self.delimiter,
            header=True,
            encoding=self.encoding,
            skip_rows=self.skip_rows,
        )
        reader.fieldnames = [self.header_converter(field) for field in reader.fieldnames]

        for row in reader:
            try:
                if not self.is_valid_row(row):
                    continue

                # A correction is either a lookup table or a callable.
                for key, correction in self.corrections.items():
                    if isinstance(correction, dict):
                        if row[key] in correction:
                            row[key] = correction[row[key]]
                    else:
                        row[key] = correction(row[key])

                organization_classification = 'legislature'

                organization_name = row['organization']
                organization_key = organization_name.lower()
                if organization_key not in organizations:
                    organization = Organization(organization_name, classification=organization_classification)
                    organization.add_source(self.csv_url)
                    yield organization
                    organizations[organization_key] = organization
                else:
                    organization = organizations[organization_key]

                role = row['primary role']
                if not role:
                    role = row['primary role'] = 'Councillor'

                yield Post(role=role, label=organization_name, organization_id=organization._id)

                name = row['name'].strip(' .,')
                district = row['district name']

                # Number the seats when several people share a district and role.
                if self.many_posts_per_area and role not in self.unique_roles:
                    seats_for_role = seat_numbers[role]
                    seats_for_role[district] += 1
                    district = '{} (seat {})'.format(district, seats_for_role[district])

                p = Person(
                    primary_org=organization_classification,
                    name=name,
                    district=district,
                    role=role,
                    party=row.get('party name'),
                )
                p.add_source(self.csv_url)

                if row.get('gender'):
                    p.gender = row['gender']
                if row.get('photo url'):
                    p.image = row['photo url']

                if row.get('source url'):
                    p.add_source(row['source url'].strip(' .,'))

                if row.get('website'):
                    p.add_link(row['website'], note='web site')
                if row.get('facebook'):
                    # Drop any query string or fragment from the link.
                    p.add_link(re.sub(r'[#?].+', '', row['facebook']))
                if row.get('twitter'):
                    p.add_link(row['twitter'])

                if row['email']:
                    p.add_contact('email', row['email'].strip(' .,'))
                if row['address']:
                    p.add_contact('address', row['address'], 'legislature')
                for column, contact_type in optional_contacts:
                    if row.get(column):
                        p.add_contact(contact_type, row[column], 'legislature')
                if row.get('birth date'):
                    p.birth_date = row['birth date']

                if row.get('incumbent'):
                    p.extras['incumbent'] = row['incumbent']

                if name in self.other_names:
                    for other_name in self.other_names[name]:
                        p.add_name(other_name)

                # Validate person entity so that we can catch the exception if needed.
                p.validate()

                yield p
            except Exception as e:
                print(repr(e))
                continue
Example #35
0
    def scrape(self):
        """Yield an Organization and its members for each municipality in the COUNCIL_PAGE PDF.

        The PDF is downloaded, converted to text with ``pdftotext -layout``
        and split into blank-line-separated chunks.  Chunks that mention
        'Councillors' are parsed as one municipality: the first column holds
        the name and address, the second column the mayor and councillors.

        The download is written in binary mode to a private temporary file,
        which is removed as soon as the conversion finishes (even on error),
        rather than to a fixed path that is only cleaned up if the generator
        runs to completion.
        """
        import tempfile

        response = urlopen(COUNCIL_PAGE).read()
        fd, pdf_path = tempfile.mkstemp(suffix='.pdf')
        try:
            # The payload is raw PDF bytes, so the handle must be binary.
            with os.fdopen(fd, 'wb') as pdf:
                pdf.write(response)
            data = subprocess.check_output(['pdftotext', '-layout', pdf_path, '-'])
        finally:
            os.remove(pdf_path)

        # check_output returns bytes on Python 3; the str patterns below need
        # text.  Where the output is already str it is left untouched.
        if not isinstance(data, str):
            data = data.decode('utf-8')

        data = re.split(r'\n\s*\n', data)
        for municipality in data:

            if 'Councillors' not in municipality:
                continue
            lines = municipality.split('\n')
            # Drop a page header line and the blank line that may follow it.
            if 'Page' in lines[0]:
                lines.pop(0)
                if not lines[0].strip():
                    lines.pop(0)
            # Column boundaries are derived from the first line's layout.
            col1end = re.search(r'\s{2,}(\w)', lines[0].strip()).end()
            col2end = re.search(r':\s{2,}(\w)', lines[0].strip()).end()

            if 'Council' in lines[1]:
                # The municipality name spans two lines.
                address = lines[2][:col1end - 1].strip() + ' ' + lines[3][:col1end - 1].strip()
                district = lines[0][:col1end - 1].strip() + ' ' + lines[1][:col1end - 1].strip()
            else:
                address = lines[1][:col1end - 1].strip() + ' ' + lines[2][:col1end - 1].strip()
                district = lines[0][:col1end - 1].strip()

            organization = Organization(name=district + ' Council', classification='legislature', jurisdiction_id=self.jurisdiction.jurisdiction_id)
            organization.add_source(COUNCIL_PAGE)
            yield organization

            phone = re.findall(r'(?<=Phone: )\(?(\d{3}[\)-] ?\d{3}-\d{4})', municipality)[0].replace(') ', '-')
            email = re.findall(r'(?<=E-mail:) (\S*)', municipality)[0]
            fax = None
            if 'Fax' in municipality:
                fax = re.findall(r'(?<=Fax: )\(?(\d{3}[\)-] ?\d{3}-\d{4})', municipality)[0].replace(') ', '-')
            website = None
            if 'Website' in municipality:
                website = re.findall(r'((http:\/\/|www.)(\S*))', municipality)[0][0]

            # Names are only read once a 'Mayor:' or 'Councillors' marker
            # line has been passed; the marker also sets the current role.
            councillor_or_mayor = False
            for line in lines:
                if 'Mayor:' in line:
                    councillor_or_mayor = True
                    role = 'Mayor'
                    continue
                if 'Councillors' in line:
                    councillor_or_mayor = True
                    role = 'Councillor'
                    continue
                if councillor_or_mayor:
                    councillor = line[col1end - 1:col2end - 1].strip()
                    if not councillor:
                        continue
                    p = Person(primary_org='legislature', name=councillor, district=district)
                    p.add_source(COUNCIL_PAGE)
                    membership = p.add_membership(organization, role=role, district=district)
                    membership.add_contact_detail('address', address, 'legislature')
                    membership.add_contact_detail('voice', phone, 'legislature')
                    membership.add_contact_detail('email', email)
                    if fax:
                        membership.add_contact_detail('fax', fax, 'legislature')
                    if website:
                        p.add_link(website)
                    yield p
Example #36
0
    def scrape(self):
        """Yield the members and council Organization of each municipality.

        Builds a name -> division id map from the ``csd`` children of
        ``ocd-division/country:ca`` whose type id starts with ``13``, then
        walks the list pages linked from COUNCIL_PAGE and every municipality
        detail page linked from them. For each municipality that is neither
        unknown nor excluded, one Person is yielded per non-vacant seat and
        the Organization is yielded after its members.

        Raises:
            Exception: on a duplicate division name or id, or on a role that
                is not in ``expected_roles``.
            AssertionError: when an expected list of links, municipalities,
                groups or councillors is empty.
        """
        exclude_divisions = {
            'ocd-division/country:ca/csd:1301006',  # Saint John
            'ocd-division/country:ca/csd:1307022',  # Moncton
            'ocd-division/country:ca/csd:1310032',  # Fredericton
        }
        expected_roles = {
            'Mayor',
            'Councillor',
        }
        # Roles held by a single person: their district is the bare division
        # name rather than a numbered seat.
        unique_roles = {
            'Mayor',
        }
        # List-link text -> word used in the organization name.
        classifications = {
            'Cities': 'City',
            'Towns': 'Town',
            'Villages': 'Village',
            'Rural Communities': 'Community',
            'Regional Municipality': 'Regional',
        }
        # Page heading -> name used as a key in names_to_ids.
        corrections = {
            'Beaubassin-est/East': 'Beaubassin East',
            'Lac-Baker': 'Lac Baker',
            'Saint-François-de-Madawaska': 'Saint-François de Madawaska',
            'Saint-Hilaire': 'Saint Hilaire',
        }
        unknown_names = {
            'Haut-Madawaska',  # incorporated after Census 2016
        }
        # People sharing a name each get a distinct placeholder birth date
        # (presumably so they are not merged downstream; confirm).
        duplicate_names = {
            'Denis Savoie',
            'Josée Levesque',
            'Luc Levesque',
        }

        names_to_ids = {}
        for division in Division.get('ocd-division/country:ca').children('csd'):
            type_id = division.id.rsplit(':', 1)[1]
            if type_id.startswith('13'):
                if division.attrs['classification'] == 'P':
                    continue
                if division.name in names_to_ids:
                    raise Exception('unhandled collision: {}'.format(division.name))
                else:
                    names_to_ids[division.name] = division.id

        page = self.lxmlize(COUNCIL_PAGE)
        list_links = page.xpath('//div[@id="sidebar"]//div[contains(@class, "list")][1]//a')

        birth_date = 1900
        seen = set()

        assert len(list_links), 'No list items found'
        for list_link in list_links:
            page = self.lxmlize(list_link.attrib['href'])
            detail_urls = page.xpath('//td[1]//@href')

            assert len(detail_urls), 'No municipalities found'
            for detail_url in detail_urls:
                page = self.lxmlize(detail_url, encoding='utf-8')
                # Expand a leading "St" / "St." to "Saint" before the lookup.
                division_name = re.sub(r'\ASt\b\.?', 'Saint', page.xpath('//h1/text()')[0].split(' - ', 1)[1])
                division_name = corrections.get(division_name, division_name)

                if division_name in unknown_names:
                    continue
                division_id = names_to_ids[division_name]
                if division_id in exclude_divisions:
                    continue
                if division_id in seen:
                    raise Exception('unhandled collision: {}'.format(division_id))

                seen.add(division_id)
                division_name = Division.get(division_id).name
                organization_name = '{} {} Council'.format(division_name, classifications[list_link.text])
                organization = Organization(name=organization_name, classification='government')
                organization.add_source(detail_url)

                address = ', '.join(page.xpath('//div[@class="left_contents"]/p[1]/text()'))

                contacts = page.xpath('//div[@class="left_contents"]/p[contains(., "Contact")]/text()')
                phone = contacts[0].split(':')[1]
                # Reset for every municipality: without this, a page with no
                # second contact line raises UnboundLocalError on the first
                # municipality or reuses the previous municipality's fax.
                fax = None
                if len(contacts) > 1:
                    fax = contacts[1].split(':')[1]
                email = self.get_email(page, '//div[@class="left_contents"]', error=False)

                # First non-mailto link in the contact column, if any.
                links = page.xpath('//div[@class="left_contents"]//@href[not(contains(., "mailto:"))]')
                url = links[0] if links else None

                groups = page.xpath('//div[contains(@class, "right_contents")]/p')
                assert len(groups), 'No groups found'
                for group in groups:
                    # Headings are plural ("Councillors"); strip the "s".
                    role = group.xpath('./b/text()')[0].rstrip('s')
                    if role not in expected_roles:
                        raise Exception('unexpected role: {}'.format(role))

                    councillors = group.xpath('./text()')
                    assert len(councillors), 'No councillors found'
                    # Seat numbers count vacant seats too, so a vacancy leaves
                    # a gap rather than renumbering the seats after it.
                    for seat_number, name in enumerate(councillors, 1):
                        if 'vacant' in name.lower():
                            continue

                        if role in unique_roles:
                            district = division_name
                        else:
                            district = '{} (seat {})'.format(division_name, seat_number)

                        organization.add_post(role=role, label=district, division_id=division_id)

                        person = Person(primary_org='government', primary_org_name=organization_name, name=name, district=district, role=role)
                        person.add_source(COUNCIL_PAGE)
                        person.add_source(list_link.attrib['href'])
                        person.add_source(detail_url)

                        if name in duplicate_names:
                            person.birth_date = str(birth_date)
                            birth_date += 1

                        person.add_contact('address', address, 'legislature')
                        # @see https://en.wikipedia.org/wiki/Area_code_506
                        if phone:
                            person.add_contact('voice', phone, 'legislature', area_code=506)
                        if fax:
                            person.add_contact('fax', fax, 'legislature', area_code=506)
                        if email:
                            person.add_contact('email', email)
                        if url:
                            person.add_link(url)

                        person._related[0].extras['boundary_url'] = '/boundaries/census-subdivisions/{}/'.format(division_id.rsplit(':', 1)[1])

                        yield person

                yield organization
Example #37
0
    def scrape(self):
        """Yield a council Organization and its members for every district.

        Follows each link in the left/right content columns of COUNCIL_PAGE.
        For every linked page an Organization is yielded first; contact
        details are then read from the page's ``<dl>`` entries up to the
        "Officials" entry, and one Person is yielded per non-blank line of the
        "Elected Officials" ``<pre>`` block. A parenthesised suffix such as
        "(Mayor)" becomes the role; otherwise the role is "Councillor".

        Contact details that a page does not provide are omitted for that
        district's members.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        districts = page.xpath(
            '//div[@id="left-content" or @id="right-content"]//a')
        for district in districts:
            url = district.attrib['href']
            page = self.lxmlize(url)
            district_name = district.text_content()

            org = Organization(
                name=district_name + ' Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(url)
            yield org

            # Reset for every district: a page lacking one of these entries
            # must neither raise NameError nor inherit the previous
            # district's value.
            phone = fax = address = email = site = None

            info = page.xpath('//div[@style="WIDTH:750"]/dl')
            for entry in info:
                contact_type = entry.xpath('./dt')[0].text_content()
                # "(709) 555-1234" -> "709-555-1234"; harmless for other text
                # unless it contains parentheses.
                value = entry.xpath('./dd')[0].text_content().replace(
                    '(', '').replace(') ', '-')
                if 'Officials' in contact_type:
                    break
                if 'Tel' in contact_type:
                    phone = value
                # NOTE(review): 'Fac' presumably matches a "Facsimile" label;
                # confirm against the page.
                if 'Fac' in contact_type:
                    fax = value
                if 'Address' in contact_type:
                    address = value
                if 'Email' in contact_type:
                    email = value
                if 'Website' in contact_type:
                    site = value

            councillors = page.xpath(
                '//div[@style="WIDTH:750"]/dl/dt[contains(text(), "Elected Officials")]/parent::dl/dd/pre/text()'
            )[0].splitlines(True)
            for councillor in councillors:
                name = councillor.replace('(Mayor)', '').replace(
                    '(Deputy Mayor)', '').replace('(Chairperson)', '').strip()
                # Blank lines in the <pre> block carry no person.
                if not name:
                    continue
                # Whatever remains once the name is removed is the
                # parenthesised role, if there is one.
                role = re.sub(r'\(|\)', '',
                              councillor.replace(name, '').strip())
                if not role:
                    role = 'Councillor'
                p = Person(primary_org='legislature',
                           name=name,
                           district=district_name)
                p.add_source(COUNCIL_PAGE)
                p.add_source(url)
                membership = p.add_membership(org,
                                              role=role,
                                              district=district_name)
                if phone:
                    membership.add_contact_detail(
                        'voice', self.clean_telephone_number(phone),
                        'legislature')
                if fax:
                    membership.add_contact_detail(
                        'fax', self.clean_telephone_number(fax),
                        'legislature')
                if address:
                    membership.add_contact_detail(
                        'address', self.clean_address(address), 'legislature')
                if email:
                    membership.add_contact_detail('email', email)
                if site:
                    p.add_link(site)
                yield p
Example #38
0
    def scrape(self):
        """Yield the mayor, then one Person per remaining council table cell.

        The cells come from the print-area table of COUNCIL_PAGE, without its
        first four cells and its last one. The first remaining cell is handed
        to ``scrape_mayor``. Every other cell yields a Person whose district
        is the text after the first "-" in the cell's heading; a heading
        without "-" yields a "Regional Councillor" for "Newmarket". Email,
        phone/fax numbers and an optional link are read from the member's own
        page.
        """
        index = self.lxmlize(COUNCIL_PAGE)
        cells = index.xpath('//div[@id="printArea"]//table//tr//td')[4:-1]

        yield self.scrape_mayor(cells[0])

        for cell in cells[1:]:
            raw_name = cell.xpath('.//strong/a[last()]//text()')[0]
            name = ' '.join(raw_name.split())

            heading_parts = cell.xpath('.//strong//text()')[0].split('-')
            if len(heading_parts) > 1:
                district, role = heading_parts[1], 'Councillor'
            else:
                district, role = 'Newmarket', 'Regional Councillor'

            url = cell.xpath('.//a/@href')[0]

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)
            person.add_source(url)
            person.image = cell.xpath('.//img/@src')[0]

            detail = self.lxmlize(url)
            print_area = detail.xpath('//div[@id="printArea"]')[0]
            paragraphs = print_area.xpath('.//p[@class="heading"][2]/following-sibling::p')

            # The address is read only to advance past it (and past an empty
            # paragraph before it); it is not attached to the person.
            address = paragraphs.pop(0).text_content().strip()
            if not address:
                address = paragraphs.pop(0).text_content().strip()

            # Skip an optional paragraph mentioning the ward.
            if 'Ward' in paragraphs[0].text_content():
                paragraphs.pop(0)

            # Splitting on ":" leaves each label at the end of one segment and
            # its value at the start of the next.
            segments = paragraphs.pop(0).text_content().split(':')
            person.add_contact('email', self.get_email(detail))

            for label_segment, value_segment in zip(segments, segments[1:]):
                if '@' in value_segment:
                    continue  # executive assistant email

                number = re.findall(r'([0-9]{3}-[0-9]{3}-[0-9]{4})', value_segment)[0]
                extensions = re.findall(r'(Ext\. [0-9]{3,4})', value_segment)
                if extensions:
                    number += extensions[0].replace('Ext. ', ' x')
                contact_type = re.findall(r'[A-Za-z]+$', label_segment)[0]

                if 'Fax' in contact_type:
                    person.add_contact('fax', number, 'legislature')
                elif 'Phone' in contact_type:
                    person.add_contact('voice', number, 'legislature')
                else:
                    person.add_contact(contact_type, number, contact_type)

            links = detail.xpath('.//a[contains(text(), "http://")]')
            if links:
                person.add_link(links[0].text_content())

            yield person