Example #1
    def scrape(self):
        regional_councillor_seat_number = 1
        page = self.lxmlize(COUNCIL_PAGE)

        yield self.scrape_mayor(page)

        councillor_nodes = page.xpath('//h3[contains(text(), "Councillors")]/following-sibling::p')[:-1]
        for councillor_node in councillor_nodes:
            text = ' '.join(councillor_node.xpath('./strong/text()'))
            if not text or 'Vacant' in text:
                continue

            name, role_district = text.split(', ', 1)

            if 'Regional Councillor' in role_district:
                role = role_district
                district = 'Whitby (seat {})'.format(regional_councillor_seat_number)
                regional_councillor_seat_number += 1
            else:
                role, district = role_district.strip().split(', ')
                district = district.split(' (')[0]

            email = self.get_email(councillor_node)
            image = councillor_node.xpath('./img/@src')[0]
            p = Person(primary_org='legislature', name=name, district=district, role=role, image=image)
            p.add_source(COUNCIL_PAGE)
            p.add_contact('email', email)
            yield p
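
These examples rely on helper methods (lxmlize, get_email, get_phone, get) and a Person/Organization model provided by a scraper base class that is not shown on this page. The class and behaviour below are assumptions, not the actual library code; it is only a minimal sketch of what such helpers might look like, using requests and lxml:

import re

import requests
from lxml import html


class ScraperBase:  # hypothetical stand-in for the real base class
    user_agent = 'Mozilla/5.0'

    def get(self, url):
        # Plain HTTP GET with the configured user agent.
        return requests.get(url, headers={'User-Agent': self.user_agent})

    def lxmlize(self, url, encoding=None, user_agent=None):
        # Fetch a page and parse it into an lxml tree with absolute links.
        response = requests.get(url, headers={'User-Agent': user_agent or self.user_agent})
        if encoding:
            response.encoding = encoding
        page = html.fromstring(response.text)
        page.make_links_absolute(url)
        return page

    def get_email(self, node, expression='.', error=True):
        # First mailto: link under the node, falling back to an email-looking string.
        hrefs = node.xpath(expression + '//a[starts-with(@href, "mailto:")]/@href')
        if hrefs:
            return hrefs[0].replace('mailto:', '').strip()
        match = re.search(r'[\w.+-]+@[\w-]+\.[\w.-]+', node.text_content())
        if match:
            return match.group(0)
        if error:
            raise ValueError('No email found')
        return None

    def get_phone(self, node, area_codes=None, error=True):
        # Very loose North American phone-number pattern.
        match = re.search(r'\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}', node.text_content())
        if match:
            return match.group(0)
        if error:
            raise ValueError('No phone number found')
        return None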
Example #2
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE)
        for block in page.xpath('//div[@class="addressblock"]'):
            name_elem = block.xpath('.//a[@class="mpp"]')[0]
            name = ' '.join(name_elem.text.split())

            riding = block.xpath('.//div[@class="riding"]//text()')[0].strip().replace('--', '\u2014')
            district = riding.replace('Chatham—Kent', 'Chatham-Kent')  # m-dash to hyphen
            mpp_url = name_elem.attrib['href']

            mpp_page = self.lxmlize(mpp_url)

            image = mpp_page.xpath('//img[@class="mppimg"]/@src')
            party = mpp_page.xpath('//div[@class="mppinfoblock"]/p[last()]/text()')[0].strip()

            p = Person(primary_org='legislature', name=name, district=district, role='MPP', party=party)
            if image:
                p.image = image[0]
            p.add_source(COUNCIL_PAGE)
            p.add_source(mpp_url)

            email = block.xpath('.//div[@class="email"]')
            if email:
                p.add_contact('email', self.get_email(email[0]))

            phone = block.xpath('.//div[@class="phone"]//text()')
            if phone:
                p.add_contact('voice', phone[0], 'legislature')

            yield p
Example #3
    def scrape(self):
        self.user_agent = CUSTOM_USER_AGENT
        page = self.get(COUNCIL_PAGE)
        members = re.findall('/Members/YourMember/[^"]+', page.text)
        assert len(members), 'No members found'
        for member in members:
            detail_url = 'http://www.assembly.nl.ca%s' % member
            detail = self.lxmlize(detail_url, user_agent=CUSTOM_USER_AGENT)

            name = detail.xpath('//h1/text()')[0]
            district = re.sub(r' [\xa0–-] ', '—', detail.xpath('//h2/text()')[0])  # nbsp, en dash, or hyphen to em dash
            party = PARTIES[detail.xpath('//h3/text()')[0]]

            p = Person(primary_org='legislature', name=name, district=district, role='MHA', party=party)
            p.image = detail.xpath('//img[@class="img-responsive"]/@src')[0]

            contact = detail.xpath('//div[@class="col-md-12"]')[0]
            p.add_contact('email', self.get_email(contact))

            p.add_source(COUNCIL_PAGE)
            p.add_source(detail_url)

            for heading, _type in HEADING_TYPE.items():
                node = detail.xpath('//b[.="%s"]/../..' % heading)
                if node:
                    phone = self.get_phone(node[0], error=False)
                    if phone:
                        p.add_contact('voice', phone, _type)

            yield p
Example #4
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//div[@class="article-content"]//td[@class="ms-rteTableOddCol-0"]')
        assert len(councillors), 'No councillors found'
        yield self.scrape_mayor(councillors[0])
        for councillor in councillors[1:]:
            if not councillor.xpath('.//a'):
                continue

            texts = [text for text in councillor.xpath('.//text()') if clean_string(text)]
            name = texts[0]
            district = texts[1]
            url = councillor.xpath('.//a/@href')[0]
            page = self.lxmlize(url)

            p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            p.image = councillor.xpath('./preceding-sibling::td//img/@src')[-1]

            contacts = page.xpath('.//td[@class="ms-rteTableOddCol-0"]//text()')
            for contact in contacts:
                if re.findall(r'[0-9]{4}', contact):
                    phone = contact.strip().replace(' ', '-')
                    p.add_contact('voice', phone, 'legislature')
            get_links(p, page.xpath('.//td[@class="ms-rteTableOddCol-0"]')[0])

            email = self.get_email(page)
            p.add_contact('email', email)
            yield p
Example #5
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE)
        members = page.xpath('//table[1]//tr')

        assert len(members), 'No members found'
        for member in members:
            if not member.text_content().strip():
                continue

            name = member.xpath('./td[2]//a[1]//text()')[0]

            district_name = member.xpath('./td[2]//a[contains(.//text(), "MLA")]//text()')[0].split(':')[1].replace('St ', 'St. ').split('-')
            district = district_name[0].strip() + '-' + district_name[1].strip()
            url = member.xpath('./td[2]//a[1]/@href')[0]
            ext_infos = self.scrape_extended_info(url)
            p = Person(primary_org='legislature', name=name, district=district, role='MLA')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            if ext_infos:  # member pages might return errors
                email, phone, photo_url = ext_infos
                p.image = photo_url
                if email:
                    p.add_contact('email', email)
                if phone:
                    p.add_contact('voice', phone, 'legislature')
            yield p
Example #6
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1')

        general_contacts = page.xpath('//p[@class="large_title"]/following-sibling::p/text()')
        general_phone = general_contacts[0]
        general_fax = general_contacts[1]

        councillors = page.xpath('//tr/td/p/b')
        for councillor in councillors:
            text = councillor.text_content()
            if '@' in text or 'NEWSLETTER' in text:
                continue

            if 'Mayor' in text:
                name = text.replace('Mayor', '').strip()
                district = 'Dollard-Des Ormeaux'
                role = 'Maire'
            else:
                name = re.split(r'[0-9]', text)[1]
                district = 'District ' + re.findall(r'[0-9]', text)[0]
                role = 'Conseiller'

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.image = councillor.xpath('./parent::p/parent::td/parent::tr/preceding-sibling::tr//img/@src')[0]

            email = self.get_email(councillor, './parent::p/following-sibling::p')
            p.add_contact('email', email)

            p.add_contact('voice', general_phone, 'legislature')
            p.add_contact('fax', general_fax, 'legislature')

            yield p
Example #7
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//div[@id="c2087"]//a')
        for councillor in councillors:
            name = councillor.text_content()
            url = councillor.attrib['href']
            page = self.lxmlize(url)
            if 'Maire' in page.xpath('//h2/text()')[0]:
                district = 'Sherbrooke'
                role = 'Maire'
            else:
                district = page.xpath('//div[@class="csc-default"]//a[@target="_blank"]/text()')[0].replace('district', '').replace('Domaine Howard', 'Domaine-Howard').strip()
                role = 'Conseiller'
            if district in ('de Brompton', 'de Lennoxville'):
                district = district.replace('de ', '')
            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.image = page.xpath('//div[@class="csc-textpic-image csc-textpic-last"]//img/@src')[0]
            parts = page.xpath('//li[contains(text(), "phone")]/text()')[0].split(':')
            note = parts[0]
            phone = parts[1]
            p.add_contact(note, phone, note)
            email = self.get_email(page)
            if email:
                p.add_contact('email', email)
            if district == 'Brompton':
                p._related[0].extras['boundary_url'] = '/boundaries/sherbrooke-boroughs/brompton/'
            elif district == 'Lennoxville':
                p._related[0].extras['boundary_url'] = '/boundaries/sherbrooke-boroughs/lennoxville/'
            yield p
Example #8
    def scrape(self):
        councillor_seat_number = 1

        contact_page = self.lxmlize(CONTACT_URL)
        email = self.get_email(contact_page)

        page = self.lxmlize(COUNCIL_PAGE)
        for url in page.xpath('//a/@href[contains(., "members/")]'):
            page = self.lxmlize(url)
            role, name = page.xpath('//h1//text()')[0].split(' ', 1)
            photo_url = page.xpath('//img/@src')[0]

            if role == 'Mayor':
                district = 'Richmond'
            else:
                district = 'Richmond (seat {})'.format(councillor_seat_number)
                councillor_seat_number += 1

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.image = photo_url
            p.add_source(COUNCIL_PAGE)
            p.add_source(CONTACT_URL)
            p.add_source(url)
            p.add_contact('email', email)
            yield p
Example #9
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE)

        mayor_url = page.xpath('//a[contains(text(), "Mayor")]/@href')[0]
        mayor = self.scrape_mayor(mayor_url)
        if mayor:
            yield mayor

        councillors_url = page.xpath('//a[contains(text(), "Councillors")]/@href')[0]
        cpage = self.lxmlize(councillors_url)

        councillor_rows = cpage.xpath("//tr[td//img]")[:-1]
        for councillor_row in councillor_rows:
            img_cell, info_cell = tuple(councillor_row)
            if info_cell.xpath('.//p//text()[contains(., "Vacant")]'):
                continue
            name = info_cell.xpath('.//p//text()[contains(., "Councillor")]')[0].replace("Councillor ", "")
            district = info_cell.xpath('.//p[contains(text(), "District")]//text()')[0]
            email = self.get_email(info_cell)
            phone = self.get_phone(info_cell, area_codes=[438, 514])
            img_url_rel = img_cell.xpath(".//img/@src")[0]
            img_url = urljoin(councillors_url, img_url_rel)

            p = Person(primary_org="legislature", name=name, district=district, role="Conseiller")
            p.add_source(COUNCIL_PAGE)
            p.add_source(councillors_url)
            p.add_contact("email", email)
            p.add_contact("voice", phone, "legislature")
            p.image = img_url
            yield p
Example #10
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT)

        mayor_url = page.xpath('//a[contains(text(), "Mayor")]/@href')[0]
        mayor = self.scrape_mayor(mayor_url)
        if mayor:
            yield mayor

        councillors_url = page.xpath('//a[contains(text(), "Councillors")]/@href')[0]
        cpage = self.lxmlize(councillors_url, user_agent=CUSTOM_USER_AGENT)

        councillors = cpage.xpath('//tr[td//img]')[:-1]

        assert len(councillors), 'No councillors found'
        for councillor_row in councillors:
            img_cell, info_cell = tuple(councillor_row)
            if info_cell.xpath('.//p//text()[contains(., "Vacant")]'):
                continue
            cells = [x.strip() for x in info_cell.xpath('.//text()') if re.sub('\xa0', ' ', x).strip()]
            name = cells[0].replace('Councillor ', '')
            district = info_cell.xpath('.//p[contains(text(), "District")]//text()')[0]
            email = self.get_email(info_cell)
            phone = self.get_phone(info_cell, area_codes=[438, 514], error=False)
            img_url_rel = img_cell.xpath('.//img/@src')[0]
            img_url = urljoin(councillors_url, img_url_rel)

            p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.add_source(councillors_url)
            p.add_contact('email', email)
            if phone:
                p.add_contact('voice', phone, 'legislature')
            p.image = img_url
            yield p
Example #11
    def scrape(self):
        councillor_seat_number = 1

        page = self.lxmlize(COUNCIL_PAGE)
        councillors = page.xpath('//div[contains(@class, "entry")]')[0].xpath('.//@href')
        assert len(councillors), 'No councillors found'
        for url in councillors:
            if '@' in url:
                continue

            page = self.lxmlize(url)
            main = page.xpath('//main[@id="content"]')[0]

            name = main.xpath('.//h1//text()')[0]

            if 'Mayor' in main.text_content():
                name = name.replace('Mayor ', '')
                role = 'Mayor'
                district = 'Saanich'
            else:
                role = 'Councillor'
                district = 'Saanich (seat {})'.format(councillor_seat_number)
                councillor_seat_number += 1

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.image = page.xpath('.//@src')[0]
            p.add_contact('voice', self.get_phone(page, area_codes=[250]), 'legislature')
            p.add_contact('email', self.get_email(page.xpath('//main[@id="content"]')[0]))
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            yield p
Example #12
    def scrape(self):
        def char(code):
            try:
                return chr(int(code))
            except ValueError:
                return code

        page = self.lxmlize(COUNCIL_PAGE)
        for row in page.xpath('//div[@id="content"]/table/tbody/tr'):
            if 'Vacant' not in row.xpath('./td//text()')[0]:
                full_name, party, district = row.xpath('./td//text()')[:3]
                name = ' '.join(part.strip() for part in reversed(full_name.split(',')))

                p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=self.PARTIES[party])

                detail_url = row[0][0].attrib['href']
                detail = self.lxmlize(detail_url)

                image = detail.xpath('//img[@class="portrait"]/@src')[0]
                p.image = image

                try:
                    p.add_contact('voice', detail.xpath('//dd[@class="numbers"]/text()')[0].split(': ')[1], 'legislature')
                except IndexError:
                    pass

                script = detail.xpath('//dd/script/text()')
                if script:
                    codes = reversed(re.findall(r"]='(.+?)'", script[0]))
                    content = ''.join(char(code) for code in codes)
                    p.add_contact('email', re.search(r'>(.+)<', content).group(1))

                p.add_source(COUNCIL_PAGE)
                p.add_source(detail_url)
                yield p
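
In Example #12 the email address is hidden behind a small inline script that stores the mailto markup as an array of character codes and literal fragments, written out in reverse order; the scraper reverses the recorded assignments and reassembles the string before pulling the address out of the anchor text. A self-contained illustration of that decoding step, using an invented snippet (the variable name and values below are made up for the demo):

import re

def char(code):
    try:
        return chr(int(code))
    except ValueError:
        return code

# Hypothetical obfuscated script text; the assignments appear in reverse display order.
script = "x[0]='62';x[1]='/a';x[2]='60';x[3]='user@example.com';x[4]='62';x[5]='a';x[6]='60'"
codes = reversed(re.findall(r"]='(.+?)'", script))
content = ''.join(char(code) for code in codes)   # -> "<a>user@example.com</a>"
print(re.search(r'>(.+)<', content).group(1))     # -> user@example.com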
Example #13
    def scrape_mayor(self, url):
        page = self.lxmlize(url)
        name = page.xpath('//img/@alt[contains(., "Mayor")]')[0].split(' ', 1)[1]

        p = Person(primary_org='legislature', name=name, district='Markham', role='Mayor')
        p.add_source(url)

        yield p
Example #14
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE)

        corrections = {
            'Mackenzie Delta': 'Mackenzie-Delta',
            'Tu Nedhe - Wiilideh': 'Tu Nedhe',
        }

        member_cells = page.xpath('//div[@class="views-field views-field-field-picture"]/parent::td')
        for cell in member_cells:
            name = cell[1].text_content().replace(' .', '. ')  # typo on page
            riding = cell[2].text_content().strip()
            riding = corrections.get(riding, riding)

            detail_url = cell[0].xpath('.//a/@href')[0]
            detail_page = self.lxmlize(detail_url)
            photo_url = detail_page.xpath('//div[@class="field-item even"]/img/@src')[0]
            email = self.get_email(detail_page)

            contact_text = ''.join(detail_page.xpath('//div[@property="content:encoded"]/p[1]//text()'))
            phone = re.search(r'P(hone)?: ([-0-9]+)', contact_text)

            p = Person(primary_org='legislature', name=name, district=riding, role='MLA', image=photo_url)
            p.add_source(COUNCIL_PAGE)
            p.add_source(detail_url)
            p.add_contact('email', email)
            if phone:
                p.add_contact('voice', phone.group(2), 'legislature')
            yield p
Example #15
    def scrape(self):
        councillor_seat_number = 1

        page = self.lxmlize(COUNCIL_PAGE)

        for person_url in page.xpath('//h4/a/@href'):
            page = self.lxmlize(person_url)

            role, name = page.xpath('//title//text()')[0].split(' ', 1)
            photo_url = page.xpath('//div[@id="content"]//img[@style]/@src')[0]

            email = None
            phone = None
            contact_node = page.xpath('//div[@id="column-right"]//div[contains(., "Contact")]')
            if contact_node:
                email = self.get_email(contact_node[0])
                phone = self.get_phone(contact_node[0], area_codes=[604, 778])

            if role == 'Mayor':
                district = 'Burnaby'
            else:
                district = 'Burnaby (seat {})'.format(councillor_seat_number)
                councillor_seat_number += 1

            p = Person(primary_org='legislature', name=name, district=district, role=role, image=photo_url)
            p.add_source(COUNCIL_PAGE)
            p.add_source(person_url)
            if email:
                p.add_contact('email', email)
            if phone:
                p.add_contact('voice', phone, 'legislature')
            yield p
Example #16
 def scrape(self):
     csv_text = self.get(self.get_csv_url()).text
     cr = csv.DictReader(csv_text.split('\n'))
     for mla in cr:
         name = '{} {} {}'.format(mla['MLA First Name'], mla['MLA Middle Names'], mla['MLA Last Name'])
         if name.strip() == '':
             continue
         party = get_party(mla['Caucus'])
         name_without_status = name.split(',')[0]
         detail_url = (
             'http://www.assembly.ab.ca/net/index.aspx?'
             'p=mla_contact&rnumber={0}&leg=29'.format(
                 mla['Riding Number']
             )
         )
         detail_page = self.lxmlize(detail_url)
         photo_url = detail_page.xpath('//img[@class="MemPhoto"]/@src')[0]
         p = Person(
             primary_org='legislature',
             name=name_without_status,
             district=mla['Riding Name'],
             role='MLA',
             party=party,
             image=photo_url,
         )
         p.add_source(COUNCIL_PAGE)
         p.add_source(detail_url)
         if mla['Email']:
             p.add_contact('email', mla['Email'])
         if mla['Phone Number']:
             p.add_contact('voice', mla['Phone Number'], 'legislature')
         yield p
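
Example #16 feeds the raw response text, split on newlines, straight into csv.DictReader. A small illustration of the same idea on an inline CSV string (the data row is invented for the demo), using io.StringIO as Example #40 further down does, which also copes with quoted fields that contain newlines:

import csv
from io import StringIO

csv_text = 'MLA First Name,MLA Last Name,Caucus\nJane,Doe,Independent\n'
for row in csv.DictReader(StringIO(csv_text)):
    print(row['MLA First Name'], row['MLA Last Name'], row['Caucus'])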
Example #17
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE)

        yield self.scrape_mayor()

        councillors = page.xpath('//h2[@class="landing-block-title"]/a')[:-1]
        for councillor in councillors:
            url = councillor.attrib['href']
            page = self.lxmlize(url)

            district = page.xpath('//div[@id="main-content"]/h1/text()')[0]
            name = page.xpath('//div[@id="main-content"]/h2/text()')[0]

            p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            contacts = page.xpath('//aside[@class="page-sidebar"]/div[1]/p')
            for contact in contacts[:-1]:
                contact_type = contact.xpath('./strong/text()')[0]
                if 'Contact' in contact_type:
                    continue
                value = contact.xpath('./a/text()')[0]
                if 'Fax' in contact_type:
                    p.add_contact('fax', value, 'legislature')
                if 'Phone' in contact_type:
                    p.add_contact(contact_type, value, contact_type)

            yield p
Example #18
    def scrape_mayor(self):
        page = self.lxmlize(MAYOR_PAGE)
        name = page.xpath('//h1[contains(text(), "Mayor")]/text()')[0].replace('Mayor', '').strip()

        p = Person(primary_org='legislature', name=name, district='Edmonton', role='Mayor')
        p.add_source(MAYOR_PAGE)

        address = ' '.join(page.xpath('//address/p/text()'))
        p.add_contact('address', address, 'legislature')

        return p
Example #19
    def scrape(self):
        response = urlopen(COUNCIL_PAGE).read()
        pdf = open('/tmp/ns.pdf', 'wb')
        pdf.write(response)
        pdf.close()

        data = subprocess.check_output(['pdftotext', '/tmp/ns.pdf', '-']).decode('utf-8')
        emails = re.findall(r'(?<=E-mail: ).+', data)
        data = re.split(r'Mayor |Warden ', data)[1:]
        for i, mayor in enumerate(data):
            lines = mayor.splitlines(True)
            name = lines.pop(0).strip()
            if name == "Jim Smith":
                continue
            district = lines.pop(0).strip()
            if not re.findall(r'[0-9]', lines[0]):
                district = district + ' ' + lines.pop(0).strip()

            org = Organization(name=district + ' Municipal Council', classification='legislature', jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(COUNCIL_PAGE)
            yield org

            p = Person(primary_org='legislature', name=name, district=district)
            p.add_source(COUNCIL_PAGE)
            membership = p.add_membership(org, role='Mayor', district=district)

            address = lines.pop(0).strip() + ', ' + lines.pop(0).strip()
            if 'Phone' not in lines[0]:
                address = address + ', ' + lines.pop(0).strip()

            if 'Phone' not in lines[0]:
                address = address + ', ' + lines.pop(0).strip()

            phone = lines.pop(0).split(':')[1].strip()
            fax = None
            if 'Fax' in lines.pop(0):
                fax = lines.pop(0)

            membership.add_contact_detail('address', address, 'legislature')
            membership.add_contact_detail('voice', phone, 'legislature')
            if fax:
                membership.add_contact_detail('fax', fax, 'legislature')
            # @todo emails are being assigned incorrectly, e.g. Town of Berwick picks
            # up Cape Breton Regional Municipality and Region of Queens Municipality
            for i, email in enumerate(emails):
                regex = name.split()[-1].lower() + '|' + '|'.join(district.split()[-2:]).replace('of', '').lower()
                regex = regex.replace('||', '|')
                matches = re.findall(r'{}'.format(regex), email)
                if matches:
                    membership.add_contact_detail('email', emails.pop(i))
            yield p

        os.system('rm /tmp/ns.pdf')
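
This example writes the PDF to a fixed path under /tmp and shells out to pdftotext. A slightly tidier version of that fetch-and-extract step, assuming pdftotext is on the PATH (the helper name is made up), could look like:

import subprocess
import tempfile
from urllib.request import urlopen

def pdf_to_text(url, *extra_args):
    # Download the PDF into a temporary file and return pdftotext's output as text.
    with tempfile.NamedTemporaryFile(suffix='.pdf') as pdf:
        pdf.write(urlopen(url).read())
        pdf.flush()
        return subprocess.check_output(['pdftotext', *extra_args, pdf.name, '-']).decode('utf-8')

Example #34 further down follows the same pattern with pdftotext's -layout option, which would be passed here as pdf_to_text(COUNCIL_PAGE, '-layout').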
Example #20
    def scrape(self):
        councillor_seat_number = 1

        coun_page = self.lxmlize(COUNCIL_PAGE)
        contact_page = self.lxmlize(CONTACT_PAGE)
        councillors = coun_page.xpath('//div[@id="main-content"]//h3')
        contact_data = contact_page.xpath('//p[contains(./strong/text(), "Mayor & Council")]/following-sibling::table[1]//tr')[1:]

        for councillor, contact in zip(councillors, contact_data):
            text = councillor.text_content()
            if text.startswith('Councill'):
                role = 'Councillor'
                district = 'Abbotsford (seat {})'.format(councillor_seat_number)
                councillor_seat_number += 1
            else:
                role = 'Mayor'
                district = 'Abbotsford'
            name = text.split(' ', 1)[1]
            image = councillor.xpath('./img/@src')[0]
            phone = contact.xpath('./td[2]/text()')[0]
            fax = contact.xpath('./td[3]/text()')[0]

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(CONTACT_PAGE)
            p.image = image
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('fax', fax, 'legislature')

            yield p
Example #21
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE)

        councillor_trs = [tr for tr in page.xpath('//table//tr[1]') if len(tr) == 2][:-1]
        for councillor_tr in councillor_trs:
            desc = [text.strip() for text in councillor_tr.xpath('.//text()[normalize-space()]') if text.strip()]

            if len(desc) == 3:
                role = 'Maire'
                district = 'Saint-Jérôme'
            else:
                role = 'Conseiller'
                district = desc[0].replace('numéro ', '')

            name = desc[-3]
            phone = desc[-2]
            email = desc[-1]

            image = councillor_tr.xpath('.//img/@src')[0]

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.image = image
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('email', email)
            yield p
Example #22
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1')

        councillors = page.xpath('//div[@id="PageContent"]/table/tbody/tr/td')
        for councillor in councillors:
            if not councillor.text_content().strip():
                continue
            if councillor == councillors[0]:
                district = 'Kirkland'
                role = 'Maire'
            else:
                district = councillor.xpath('.//h2')[0].text_content()
                district = re.search('- (.+)', district).group(1).strip()
                district = district.replace(' Ouest', ' ouest').replace(' Est', ' est')
                role = 'Conseiller'

            name = councillor.xpath('.//strong/text()')[0]

            phone = councillor.xpath('.//div[contains(text(), "#")]/text()')[0].replace('T ', '').replace(' ', '-').replace(',-#-', ' x')
            email = self.get_email(councillor)

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('email', email)
            p.image = councillor.xpath('.//img/@src')[0]
            yield p
Example #23
    def scrape(self):
        regional_councillor_seat_number = 1
        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//a[@title="Mayor and Council::Meet Your Council"]/following-sibling::ul//@href')
        assert len(councillors), 'No councillors found'
        for councillor in councillors:
            node = self.lxmlize(councillor).xpath('//div[@id="printArea"]')[0]
            name = node.xpath('.//h1/text()')[0]

            if 'Mayor' in name:
                role = 'Mayor'
                district = 'Whitby'
                name = name.replace('Mayor ', '')
            else:
                role = node.xpath('.//h2/text()')[0]
                if 'Regional Councillor' in role:
                    district = 'Whitby (seat {})'.format(regional_councillor_seat_number)
                    regional_councillor_seat_number += 1
                else:
                    role, district = role.split(', ')
                    district = district.split(' (')[0]

            image = node.xpath('.//img/@src')[0]

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_contact('voice', self.get_phone(node), 'legislature')
            p.add_contact('email', self.get_email(node))
            p.image = image

            yield p
Example #24
    def scrape(self):
        # mayor first, can't find email
        page = self.lxmlize(MAYOR_URL)
        photo_url = page.xpath('//img/@src[contains(., "maire")]')[0]
        name = page.xpath('//td[@class="contenu"]/text()[last()]')[0]
        p = Person(primary_org='legislature', name=name, district="Trois-Rivières", role="Maire",
                   image=photo_url)
        p.add_source(MAYOR_URL)
        yield p

        resp = self.get(COUNCIL_PAGE)
        # page rendering through JS on the client
        page_re = re.compile(r'createItemNiv3.+"District (.+?)".+(index.+)\\"')
        for district, url_rel in page_re.findall(resp.text):
            if district not in ('des Estacades', 'des Plateaux', 'des Terrasses', 'du Sanctuaire'):
                district = re.sub('\A(?:de(?: la)?|des|du) ', '', district)

            url = urljoin(COUNCIL_PAGE, url_rel)
            page = self.lxmlize(url)

            name_content = page.xpath('//h2//text()')
            if name_content:
                name = name_content[0]
                email = self.get_email(page)
                photo_url = page.xpath('//img/@src[contains(., "Conseiller")]')[0]
                p = Person(primary_org='legislature', name=name, district=district, role='Conseiller',
                           image=photo_url)
                p.add_source(url)
                p.add_contact('email', email)
                yield p
Example #25
    def scrape(self):
        councillor_seat_number = 1

        page = self.lxmlize(COUNCIL_PAGE)
        nodes = page.xpath('//div[@class="view-content"]/div')
        for node in nodes:
            fields = node.xpath('./div')
            role = fields[0].xpath('./div//text()')[0]
            name = fields[2].xpath('.//a//text()')[0].title().split(role)[-1].strip()
            if name == 'Vacant':
                continue

            if 'Ward' in role:
                district = role
                role = 'Councillor'
            else:
                if 'At Large' in role:
                    role = 'Councillor at Large'
                    district = "St. John's (seat {})".format(councillor_seat_number)
                    councillor_seat_number += 1
                else:
                    district = "St. John's"
            phone = fields[3].xpath('./div//text()')[0]
            email = self.get_email(fields[5])
            photo_url = node.xpath('.//img/@src')[0]

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('email', email)
            p.image = photo_url
            yield p
Example #26
    def scrape(self):

        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//p[@class="WSIndent"]/a')
        for councillor in councillors:
            district = re.findall(r'(Ward [0-9]{1,2})', councillor.text_content())
            if district:
                district = district[0]
                name = councillor.text_content().replace(district, '').strip()
                role = 'Councillor'
            else:
                district = 'Kawartha Lakes'
                name = councillor.text_content().replace('Mayor', '').strip()
                role = 'Mayor'

            url = councillor.attrib['href']
            page = self.lxmlize(url)
            email = self.get_email(page)
            image = page.xpath('//img[@class="image-right"]/@src')[0]

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.add_contact('email', email)
            p.image = image
            yield p
Example #27
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//section[contains(@id, "js-council-member")]')
        assert len(councillors), 'No councillors found'
        for index, councillor in enumerate(councillors):

            name = ' '.join([n.strip() for n in councillor.xpath('.//h2/text()')])
            district = councillor.xpath('.//span[contains(@class, "c-info-list_label")][contains(text(), "District ")]')
            role = 'Conseiller'

            if not district and index == 0:
                district = 'Pointe-Claire'
                role = 'Maire'
            elif district:
                district = district[0].text_content().split(' – ')[0].strip()
            else:
                assert False, "error parsing district"

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.image = councillor.xpath('.//@src')[0]
            p.add_contact('email', self.get_email(councillor))
            p.add_contact('voice', self.get_phone(councillor, area_codes=[514]), 'legislature')
            p.add_source(COUNCIL_PAGE)
            yield p
Example #28
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1')

        yield self.scrape_mayor()

        councillors = page.xpath('//div[@class="articlebody-inside"]//p[contains(text(),"-")]')
        for councillor in councillors:
            url = councillor.xpath('.//a')[0].attrib['href'].replace('../', '')
            page = self.lxmlize(url, 'iso-8859-1')

            name = page.xpath('//div[@class="articletitle"]/h1')[0].text_content().replace('Councillor', '').replace('Deputy Mayor', '')
            district = 'Ward {}'.format(re.sub(r'\D+', '', page.xpath('//div[@class="articlebody-inside"]/p')[0].text_content()))

            p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            photo_url_rel = page.xpath('//div[@class="articlebody-inside"]/p/img/@src')[0].replace('/..', '')
            p.image = urljoin(url, photo_url_rel)

            contacts = page.xpath('//div[@class="articlebody-inside"]/p')[1].text_content().replace('Biography', '').replace('Committees', '').split(':')
            for i, contact in enumerate(contacts):
                if i == 0 or not contact:
                    continue
                contact_type = re.findall(r'([A-Z][a-z]+)', contacts[i - 1])[0]
                if contact_type != 'Address':
                    contact = re.split(r'[A-Z]', contact)[0]
                contact_type = CONTACT_DETAIL_TYPE_MAP[contact_type]
                p.add_contact(contact_type, contact, '' if contact_type == 'email' else 'legislature')
            yield p
Example #29
    def scrape_mayor(self, url):
        page = self.lxmlize(url)

        name = page.xpath('//div[@id="printAreaContent"]/h1/strong/text()')[0].replace('Mayor', '').strip()
        address = page.xpath('//strong[contains(text(), "mail")]/parent::p/text()')[1].replace(':', '').strip()
        phone = page.xpath('//strong[contains(text(), "phone")]/parent::p/text()')[1].split()[1]

        p = Person(primary_org='legislature', name=name, district='Caledon', role='Mayor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = page.xpath('//h2[contains(text(), "About me")]/img/@src')[0]
        p.add_contact('address', address, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        return p
Example #30
    def scrape(self):
        member_page = self.lxmlize(COUNCIL_PAGE, encoding='utf-8')
        table = member_page.xpath('//table')[0]
        rows = table.xpath('.//tr')[1:]
        assert len(rows), 'No members found'
        for row in rows:
            (namecell, constitcell, partycell) = row.xpath('.//td')
            full_name = namecell.text_content().strip()
            if full_name.lower() == 'vacant':
                continue
            (last, first) = full_name.split(',')
            name = first.replace('Hon.', '').strip() + ' ' + last.title().strip()
            district = ' '.join(constitcell.text_content().split())
            party = get_party(partycell.text)

            url = namecell.xpath('.//a')[0].get('href')

            page = self.lxmlize(url)
            email = self.get_email(page)

            p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=party)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.add_contact('email', email)

            image = page.xpath('//img[@class="page_graphic"]/@src')
            if image:
                p.image = image[0]

            yield p
Example #31
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE)
        types = page.xpath(
            '//div[@class="bluearrow shaded bottomborder "][1]/ul/li/a/@href'
        )[:4]
        for org_type, link in enumerate(types):
            page = self.lxmlize(link)
            district_urls = page.xpath(
                '//div[@class="parbase list section cplist"]/table/tr/td[1]/b/a/@href'
            )
            for district_url in district_urls:
                page = self.lxmlize(district_url)
                district = page.xpath('//div[@class="pageHeader"]/h1/text()')[0].split(' - ')[1].strip()

                org = Organization(
                    name=district + org_types[org_type],
                    classification='legislature',
                    jurisdiction_id=self.jurisdiction.jurisdiction_id)
                org.add_source(district_url)
                yield org

                address = ', '.join(
                    page.xpath('//div[@class="left_contents"]/p[1]/text()'))
                contacts = page.xpath(
                    '//div[@class="left_contents"]/p[b[text() = "Contact"]]/text()'
                )
                phone = contacts[0].split(':')[1].strip().replace(' ', '-')
                fax = contacts[1].split(':')[1].strip().replace(' ', '-')
                email = self.get_email(page, '//div[@class="left_contents"]')

                site = page.xpath(
                    '//div[@class="left_contents"]//a[not(contains(@href,"mailto:"))]'
                )
                if site:
                    site = site[0].text_content()

                councillors = page.xpath(
                    '//div[@class="right_contents"]//p/text()')
                for i, councillor in enumerate(councillors):
                    if 'Vacant' in councillor:
                        continue
                    p = Person(primary_org='legislature',
                               name=councillor,
                               district=district)
                    p.add_source(COUNCIL_PAGE)
                    p.add_source(link)
                    p.add_source(district_url)

                    if i == 0:
                        membership = p.add_membership(org, role='Mayor')
                    else:
                        membership = p.add_membership(org, role='Councillor')

                    membership.post_id = district
                    membership.add_contact_detail('address', address,
                                                  'legislature')
                    if phone:
                        membership.add_contact_detail('voice', phone,
                                                      'legislature')
                    if fax:
                        membership.add_contact_detail('fax', fax,
                                                      'legislature')
                    if email:
                        membership.add_contact_detail('email', email)
                    if site:
                        p.add_link(site)
                    yield p
Example #32
    def scrape_mayor(self, div):
        name = div.xpath('.//a')[0].text_content()
        url = div.xpath('.//a/@href')[0]
        page = self.lxmlize(url)
        contact_url = page.xpath('//a[@title="Joindre le maire"]/@href')[0]
        contact_page = self.lxmlize(contact_url)

        p = Person(primary_org='legislature',
                   name=name,
                   district='Saint-Jean-sur-Richelieu',
                   role='Maire')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_source(contact_url)

        p.image = div.xpath('./preceding-sibling::td//img/@src')[-1]

        contacts = contact_page.xpath(
            '//div[@id="ctl00_PlaceHolderMain_ctl01_ctl01__ControlWrapper_RichHtmlField"]//div/font/text()'
        )
        address = ' '.join(contacts[:4])
        phone = contacts[-3].split(':')[1].strip().replace(' ', '-')
        fax = contacts[-2].split(':')[1].strip().replace(' ', '-')
        p.add_contact('address', address, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')
        # mayor's email is a form
        return p
Example #33
    def scrape(self):
        councillor_seat_number = 1
        page = self.lxmlize(COUNCIL_PAGE)

        mayor = page.xpath('//div/a[contains(@title, "Profile")][1]/@href')
        councillors = mayor + page.xpath(
            '//td//a[contains(@title, "Profile")][1]/@href')
        assert len(councillors), 'No councillors found'
        for councillor in councillors:
            page = self.lxmlize(councillor)
            info = page.xpath('//table/tbody/tr/td[2]')[0]

            for br in info.xpath('*//br'):
                br.tail = '\n' + br.tail if br.tail else '\n'
            lines = [
                line.strip() for line in info.text_content().split('\n')
                if line.strip()
            ]
            name = lines[0].replace('Councillor ', '').replace('Mayor ', '')

            if lines[1].endswith(' Ward'):
                district = lines[1].replace(' Ward', '')
                role = 'Councillor'
            elif 'At Large' in lines[1]:
                role = 'Councillor at Large'
                district = 'Thunder Bay (seat {})'.format(
                    councillor_seat_number)
                councillor_seat_number += 1
            else:
                district = 'Thunder Bay'
                role = 'Mayor'
            name = name.replace('Councillor', '').replace('At Large', '').replace('Mayor', '').strip()

            p = Person(primary_org='legislature',
                       name=name,
                       district=district,
                       role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(councillor)

            p.image = page.xpath('//td[@valign="top"]/img/@src')[0]

            address = ', '.join(info.xpath('./p/text()')[0:2]).strip()
            address = re.sub(r'\s{2,}', ' ', address)

            p.add_contact('address', address, 'legislature')

            contacts = filter(None, (text.strip()
                                     for text in info.xpath('./p[2]/text()')))
            for contact in contacts:
                contact_type, contact = contact.replace('Cel:', 'Cell:').split(':')
                contact = contact.replace('(1st)', '').replace('(2nd)', '').strip()
                if 'Fax' in contact_type:
                    p.add_contact('fax', contact, 'legislature')
                elif 'Email' in contact_type:
                    break
                else:
                    p.add_contact('voice', contact, contact_type)

            email = self.get_email(info)
            p.add_contact('email', email)

            yield p
Example #34
    def scrape(self):
        response = urlopen(COUNCIL_PAGE).read()
        pdf = open('/tmp/sk.pdf', 'wb')
        pdf.write(response)
        pdf.close()

        data = subprocess.check_output(['pdftotext', '-layout', '/tmp/sk.pdf', '-']).decode('utf-8')

        data = data.splitlines(True)
        pages = []
        page = []
        for line in data:
            if line.strip() and 'Page' not in line and 'CITIES' not in line and 'NORTHERN TOWNS, VILLAGES' not in line:
                page.append(line)
            elif page:
                pages.append(page)
                page = []

        districts = []
        for page in pages:
            index = re.search(r'(\s{6,})', page[0])
            if index:
                index = index.end() - 1
            else:
                index = -1
            dist1 = []
            dist2 = []
            for line in page:
                dist1.append(line[:index].strip())
                dist2.append(line[index:].strip())
            districts.append(dist1)
            districts.append(dist2)

        for district in districts:

            district_name = district.pop(0).split(',')[0].title()

            org = Organization(
                name=district_name + ' Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(COUNCIL_PAGE)

            councillors = []
            contacts = {}
            for i, line in enumerate(district):
                if 'Phone' in line:
                    phone = line.split(':')[1].replace('(', '').replace(') ', '-').strip()
                    if phone:
                        contacts['voice'] = phone
                if 'Fax' in line:
                    fax = line.split(':')[1].replace('(', '').replace(') ', '-').strip()
                    if fax:
                        contacts['fax'] = fax
                if 'E-Mail' in line:
                    email = line.split(':')[1].strip()
                    if email:
                        contacts['email'] = email
                if 'Address' in line and line.split(':')[1].strip():
                    address = line.split(':')[1].strip() + ', ' + ', '.join(
                        district[i + 1:]).replace(' ,', '')
                    contacts['address'] = address
                if 'Mayor' in line or 'Councillor' in line or 'Alderman' in line:
                    councillor = (line.split(':')[1]
                                  .replace('Mr.', '').replace('Mrs.', '').replace('Ms.', '')
                                  .replace('His Worship', '').replace('Her Worship', '')
                                  .strip())
                    role = line.split(':')[0].strip()
                    if councillor:
                        councillors.append([councillor, role])

            if not councillors:
                continue
            yield org
            for councillor in councillors:
                p = Person(primary_org='legislature',
                           name=councillor[0],
                           district=district_name)
                p.add_source(COUNCIL_PAGE)
                membership = p.add_membership(org,
                                              role=councillor[1],
                                              district=district_name)

                for key, value in contacts.items():
                    membership.add_contact_detail(
                        key, value, '' if key == 'email' else 'legislature')
                yield p
        os.system('rm /tmp/sk.pdf')
Example #35
def scrape_mayor(div, name):

    p = Person(primary_org='legislature', name=name, district='Wilmot', role='Mayor')
    p.add_source(COUNCIL_PAGE)

    info = div.xpath('./parent::p//text()')
    info.pop(0)
    address = ' '.join(info[:3])
    phone = info[3].split()[1]
    fax = info[4].split()[1]
    email = info[-1]
    p.add_contact('address', address, 'legislature')
    p.add_contact('voice', phone, 'legislature')
    p.add_contact('fax', fax, 'legislature')
    p.add_contact('email', email)
    return p
Example #36
    def scrape(self):
        screen_names = json.loads(
            self.get('http://scrapers-ruby.herokuapp.com/twitter_users').text)

        page = self.lxmlize(COUNCIL_PAGE)
        rows = page.xpath('//div[@class="main-content"]//tr')[1:]
        assert len(rows), 'No members found'
        for row in rows:
            name_cell = row.xpath('./td[1]')[0]
            last_name = name_cell.xpath('.//span[1]//text()')[0]
            first_name = name_cell.xpath('.//span[2]//text()')[0]
            name = '{} {}'.format(first_name, last_name)
            constituency = row.xpath('./td[2]//text()')[0].replace('–', '—')  # en dash to em dash
            if constituency == 'Mont-Royal':
                constituency = 'Mount Royal'
            province = row.xpath('./td[3]//text()')[0]
            party = row.xpath('string(./td[4])')  # allow string()
            url = name_cell.xpath('.//a/@href')[0]
            if province == 'Québec':
                url = url.replace('/en/', '/fr/')

            mp_page = self.lxmlize(url)
            email = self.get_email(mp_page,
                                   '//span[@class="caucus"]',
                                   error=False)
            photo = mp_page.xpath(
                '//div[@class="profile overview header"]//img/@src')[0]

            m = Person(primary_org='lower',
                       name=name,
                       district=constituency,
                       role='MP',
                       party=party)
            m.add_source(COUNCIL_PAGE)
            m.add_source(url)
            screen_name = screen_names.get(name)
            if screen_name:
                m.add_link('https://twitter.com/{}'.format(screen_name))
            # @see http://www.parl.gc.ca/Parliamentarians/en/members/David-Yurdiga%2886260%29
            if email:
                m.add_contact('email', email)
            elif name == 'Adam Vaughan':
                m.add_contact('email', '*****@*****.**')

            if photo:
                # Determine whether the photo is actually a generic silhouette
                photo_response = self.get(photo)
                if (photo_response.status_code == 200
                        and hashlib.sha1(photo_response.content).hexdigest()
                        not in IMAGE_PLACEHOLDER_SHA1):
                    m.image = photo

            personal_url = mp_page.xpath(
                '//a[contains(@title, "Personal Web Site")]/@href')
            if personal_url:
                m.add_link(personal_url[0])

            if province == 'Québec':
                m.add_contact('address',
                              'Chambre des communes\nOttawa ON  K1A 0A6',
                              'legislature')
            else:
                m.add_contact('address',
                              'House of Commons\nOttawa ON  K1A 0A6',
                              'legislature')
            voice = mp_page.xpath(
                '//div[@class="hilloffice"]//span//text()[contains(., "Telephone:")]|//div[@class="hilloffice"]//span//text()[contains(., "Téléphone :")]'
            )[0].replace('Telephone: ', '').replace('Téléphone : ', '')
            if voice:
                m.add_contact('voice', voice, 'legislature')
            fax = mp_page.xpath(
                '//div[@class="hilloffice"]//span//text()[contains(., "Fax:")]|//div[@class="hilloffice"]//span//text()[contains(., "Télécopieur :")]'
            )[0].replace('Fax: ', '').replace('Télécopieur : ', '')
            if fax:
                m.add_contact('fax', fax, 'legislature')

            for i, li in enumerate(
                    mp_page.xpath('//div[@class="constituencyoffices"]//li')):
                spans = li.xpath('./span[not(@class="spacer")]')
                note = 'constituency'
                if i:
                    note += ' ({})'.format(i + 1)
                m.add_contact(
                    'address',
                    '\n'.join([
                        spans[0].text_content(),  # address line 1
                        spans[1].text_content(),  # address line 2
                        spans[2].text_content(),  # city, region
                        spans[3].text_content(),  # postal code
                    ]),
                    note)
                voice = li.xpath(
                    './span//text()[contains(., "Telephone:")]|./span//text()[contains(., "Téléphone :")]'
                )
                if voice:
                    voice = voice[0].replace('Telephone: ',
                                             '').replace('Téléphone : ', '')
                    if voice:
                        m.add_contact('voice', voice, note)
                fax = li.xpath(
                    './span//text()[contains(., "Fax:")]|./span//text()[contains(., "Télécopieur :")]'
                )
                if fax:
                    fax = fax[0].replace('Fax: ',
                                         '').replace('Télécopieur : ', '')
                    if fax:
                        m.add_contact('fax', fax, note)

            yield m
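
Example #36 filters out the House of Commons' generic silhouette photos by comparing the SHA-1 digest of the downloaded image against a known set (IMAGE_PLACEHOLDER_SHA1). A minimal illustration of that check, with a made-up digest set and helper name:

import hashlib

import requests

IMAGE_PLACEHOLDER_SHA1 = {'0000000000000000000000000000000000000000'}  # made-up digest for the demo

def usable_photo(url):
    # Fetch the image and keep it only if it is not a known placeholder silhouette.
    response = requests.get(url)
    return (response.status_code == 200
            and hashlib.sha1(response.content).hexdigest() not in IMAGE_PLACEHOLDER_SHA1)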
Example #37
    def scrape_mayor(self):
        page = self.lxmlize(MAYOR_PAGE)
        image = page.xpath('//img[contains(@alt, "Mayor")]/@src')[0]

        contact_url = page.xpath(
            '//a[contains(text(), "Contact the Mayor")]/@href')[0]
        contact_page = self.lxmlize(contact_url)

        infos = contact_page.xpath(
            '//h4[contains(text(), "Address")]/following-sibling::p')
        name = ' '.join(infos[0].text_content().split('\n')[0].split()[2:])
        address = ' '.join(infos[0].text_content().split('\n')[1:])
        phone = infos[1].text_content().split('\n')[0].replace('Phone', '')
        fax = infos[1].text_content().split('\n')[1].replace('Fax', '')

        p = Person(primary_org='legislature',
                   name=name,
                   district='Saskatoon',
                   role='Mayor')
        p.add_source(MAYOR_PAGE)
        p.add_source(contact_url)
        p.image = image
        p.add_contact('address', address, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')
        return p
Example #38
    def scrape(self):
        seat_numbers = defaultdict(int)

        page = self.lxmlize(COUNCIL_PAGE)

        yield self.scrape_mayor()

        councillors = page.xpath('//div[@id="centre_content"]//tr')
        for councillor in councillors:
            if 'Position' in councillor.text_content():
                continue

            ward = councillor.xpath('./td')[0].text_content().replace(
                'Councillor', '')
            seat_numbers[ward] += 1
            district = '{} (seat {})'.format(ward, seat_numbers[ward])
            name = councillor.xpath('./td')[1].text_content()
            url = councillor.xpath('./td/a')[0].attrib['href']

            p = Person(primary_org='legislature',
                       name=name,
                       district=district,
                       role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            page = self.lxmlize(url)

            content = page.xpath('//div[@id="centre_content"]')[0]
            email = self.get_email(content)
            p.add_contact('email', email)
            p.add_contact('voice',
                          self.get_phone(content, area_codes=[226, 519]),
                          'legislature')

            p.image = page.xpath(
                'string(//div[@id="centre_content"]//img/@src)'
            )  # can be empty

            links = page.xpath('//div[@id="centre_content"]//a')
            if len(links) > 2:
                p.add_link(links[-1].attrib['href'])
            yield p
Example #39
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE)
        councillors = page.xpath('//div[./h2/a[contains(@href, "/District")]]')

        for councillor in councillors:
            parts = (text.replace(',', '').strip() for text in councillor.xpath('./p/text()'))
            district = re.sub(r' ?[–—-] ?', '—', '—'.join(filter(None, parts)))

            name_elem = councillor.xpath('./p/strong/text()')[0]
            if 'Councillor' in name_elem:
                name = name_elem.strip()[len('Councillor '):]
            else:
                name = name_elem

            if name != 'To be determined':
                photo = councillor.xpath('./p/a/img/@src')[0]

                councillor_page = self.lxmlize(
                    councillor.xpath('./h2/a/@href')[0])
                contact_page_url = councillor_page.xpath(
                    '//li/a[contains(@href, "contact")]/@href')[0]
                contact_page = self.lxmlize(contact_page_url)
                contact_node = contact_page.xpath(
                    '//div[./h1[contains(text(), "Contact")]]')[0]

                phone = self.get_phone(contact_node, area_codes=[902])
                email = self.get_email(contact_node)

                p = Person(primary_org='legislature',
                           name=name,
                           district=district,
                           role='Councillor')
                p.add_source(COUNCIL_PAGE)
                p.add_source(contact_page_url)
                p.add_contact('voice', phone, 'legislature')
                p.add_contact('email', email)
                p.image = photo
                yield p

        mayor_page = self.lxmlize(MAYOR_PAGE, 'iso-8859-1')
        name = ' '.join(mayor_page.xpath(
            '//h2[contains(., "Bio")]/text()')).strip()[:-len(' Bio')]
        contact_page = self.lxmlize(MAYOR_CONTACT_URL, 'iso-8859-1')
        email = self.get_email(contact_page)

        p = Person(primary_org='legislature',
                   name=name,
                   district='Halifax',
                   role='Mayor')
        p.add_source(MAYOR_PAGE)
        p.add_source(MAYOR_CONTACT_URL)
        p.add_contact('email', email)
        yield p
Beispiel #40
0
    def scrape(self):
        index = self.lxmlize(MEMBER_INDEX_URL)
        csv_text = self.get(COUNCIL_PAGE).text
        csv_text = '\n'.join(csv_text.split('\n')[3:])  # discard first 3 rows
        reader = csv.reader(StringIO(csv_text))
        # make unique field names for the two sets of address fields
        field_names = next(reader)
        for name in OFFICE_FIELDS:
            assert(field_names.count(name) == 2)
            field_names[field_names.index(name)] = '{} 1'.format(name)
            field_names[field_names.index(name)] = '{} 2'.format(name)
        rows = [dict(zip(field_names, row)) for row in reader]
        assert len(rows), 'No members found'
        for mla in rows:
            name = '{} {} {}'.format(
                mla['MLA First Name'],
                mla['MLA Middle Names'],
                mla['MLA Last Name'],
            )
            if name.strip() == '':
                continue
            party = get_party(mla['Caucus'])
            name_without_status = name.split(',')[0]
            row_xpath = '//td[normalize-space()="{}"]/..'.format(
                mla['Constituency Name'],
            )
            detail_url, = index.xpath('{}//a/@href'.format(row_xpath))
            photo_url, = index.xpath('{}//img/@src'.format(row_xpath))
            p = Person(
                primary_org='legislature',
                name=name_without_status,
                district=mla['Constituency Name'],
                role='MLA',
                party=party,
                image=photo_url,
            )
            p.add_source(COUNCIL_PAGE)
            p.add_source(detail_url)
            if mla['Email']:
                p.add_contact('email', mla['Email'])
            elif mla.get('MLA Email'):
                p.add_contact('email', mla['MLA Email'])
            assert(mla['Address Type 1'] == 'Legislature Office')
            assert(mla['Address Type 2'] == 'Constituency Office')

            for suffix, note in ((1, 'legislature'), (2, 'constituency')):
                for key, contact_type in (('Phone', 'voice'), ('Fax', 'fax')):
                    value = mla['{} Number {}'.format(key, suffix)]
                    if value and value != 'Pending':
                        p.add_contact(contact_type, value, note)
                address = ', '.join(filter(bool, [
                    mla['{} {}'.format(field, suffix)] for field in ADDRESS_FIELDS
                ]))
                if address:
                    p.add_contact('address', address, note)
            yield p
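
The renaming loop above disambiguates CSV headers that occur twice (one set per office type) before building row dicts; without it, a plain `csv.DictReader` would silently keep only the last duplicate column. A small sketch of the same idea, assuming a hypothetical "Phone Number" column in place of the real `OFFICE_FIELDS`:

import csv
from io import StringIO

# Hypothetical CSV with a repeated "Phone Number" column (legislature, then constituency).
csv_text = 'Name,Phone Number,Phone Number\nJane Doe,555-0001,555-0002\n'

reader = csv.reader(StringIO(csv_text))
field_names = next(reader)
for name in ('Phone Number',):  # stands in for OFFICE_FIELDS
    assert field_names.count(name) == 2
    # index() finds the first remaining duplicate each time, so the two
    # occurrences become "Phone Number 1" and "Phone Number 2" in order.
    field_names[field_names.index(name)] = '{} 1'.format(name)
    field_names[field_names.index(name)] = '{} 2'.format(name)

rows = [dict(zip(field_names, row)) for row in reader]
print(rows[0]['Phone Number 1'], rows[0]['Phone Number 2'])  # 555-0001 555-0002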
Beispiel #41
0
    def scrape(self):
        parties = {
            'BC NDP': 'New Democratic Party of British Columbia',
            'BC Liberal Party': 'British Columbia Liberal Party',
        }

        page = self.lxmlize(COUNCIL_PAGE, xml=True)

        nsmap = {'d': 'http://schemas.microsoft.com/ado/2007/08/dataservices'}
        members = page.xpath('//d:Cells', namespaces=nsmap)
        assert len(members), 'No members found'
        for member in members:
            url = member.xpath(
                './d:element/d:Key[text()="Path"]/following-sibling::d:Value/text()',
                namespaces=nsmap)[0]
            page = self.lxmlize(url)

            name = page.xpath(
                '//div[contains(@class, "BCLASS-pagetitle")]//h3/text()'
            )[0].replace('Wm.', '').replace(', Q.C.', '').strip()
            district, party = cleanup_list(
                page.xpath(
                    '//div[@id="MinisterTitle"]/following-sibling::text()'))
            party = parties.get(party, party)
            p = Person(primary_org='legislature',
                       name=name,
                       district=district,
                       role='MLA',
                       party=party)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            p.image = page.xpath('//img[contains(@src, "Members")]/@src')[0]

            email = page.xpath(
                '//div[@class="convertToEmail"]//text()')[0].strip()
            if '#' in email:
                email = email.split('#')[0]
            if email:
                p.add_contact('email', email)

            office = ', '.join(
                cleanup_list(
                    page.xpath(
                        '//h4[contains(text(), "Office:")]/ancestor::div/text()'
                    )))
            office = re.sub(r'\s{2,}', ' ', office)
            p.add_contact('address', office, 'legislature')

            constituency = ', '.join(
                cleanup_list(
                    page.xpath(
                        '//h4[contains(text(), "Constituency:")]/ancestor::div[1]//text()'
                    )))
            constituency = re.sub(r'\s{2,}', ' ',
                                  constituency).split(', Phone')[0]
            p.add_contact('address', constituency, 'constituency')

            phones = cleanup_list(
                page.xpath(
                    '//span[contains(text(), "Phone:")]/following-sibling::text()'
                ))

            office_phone = phones[0]
            p.add_contact('voice', office_phone, 'legislature')
            if len(phones) > 1:
                constituency_phone = phones[1]
                p.add_contact('voice', constituency_phone, 'constituency')

            yield p
Beispiel #42
0
    def scrape(self):
        exclude_divisions = {
            'ocd-division/country:ca/csd:1301006',  # Saint John
            'ocd-division/country:ca/csd:1307022',  # Moncton
            'ocd-division/country:ca/csd:1310032',  # Fredericton
        }
        expected_roles = {
            'Mayor',
            'Councillor',
        }
        unique_roles = {
            'Mayor',
        }
        classifications = {
            'Cities': 'City',
            'Towns': 'Town',
            'Villages': 'Village',
            'Rural Communities': 'Community',
            'Regional Municipality': 'Regional',
        }
        corrections = {
            'Beaubassin-est/East': 'Beaubassin East',
            'Lac-Baker': 'Lac Baker',
            'Saint-François-de-Madawaska': 'Saint-François de Madawaska',
            'Saint-Hilaire': 'Saint Hilaire',
        }
        unknown_names = {
            'Haut-Madawaska',  # incorporated after Census 2016
        }
        duplicate_names = {
            'Denis Savoie',
            'Josée Levesque',
            'Luc Levesque',
        }

        names_to_ids = {}
        for division in Division.get('ocd-division/country:ca').children(
                'csd'):
            type_id = division.id.rsplit(':', 1)[1]
            if type_id.startswith('13'):
                if division.attrs['classification'] == 'P':
                    continue
                if division.name in names_to_ids:
                    raise Exception('unhandled collision: {}'.format(
                        division.name))
                else:
                    names_to_ids[division.name] = division.id

        page = self.lxmlize(COUNCIL_PAGE)
        list_links = page.xpath(
            '//div[@id="sidebar"]//div[contains(@class, "list")][1]//a')

        birth_date = 1900
        seen = set()

        assert len(list_links), 'No list items found'
        for list_link in list_links:
            page = self.lxmlize(list_link.attrib['href'])
            detail_urls = page.xpath('//td[1]//@href')

            assert len(detail_urls), 'No municipalities found'
            for detail_url in detail_urls:
                page = self.lxmlize(detail_url, encoding='utf-8')
                division_name = re.sub(
                    r'\ASt\b\.?', 'Saint',
                    page.xpath('//h1/text()')[0].split(' - ', 1)[1])
                division_name = corrections.get(division_name, division_name)

                if division_name in unknown_names:
                    continue
                division_id = names_to_ids[division_name]
                if division_id in exclude_divisions:
                    continue
                if division_id in seen:
                    raise Exception(
                        'unhandled collision: {}'.format(division_id))

                seen.add(division_id)
                division_name = Division.get(division_id).name
                organization_name = '{} {} Council'.format(
                    division_name, classifications[list_link.text])
                organization = Organization(name=organization_name,
                                            classification='government')
                organization.add_source(detail_url)

                address = ', '.join(
                    page.xpath('//div[@class="left_contents"]/p[1]/text()'))

                contacts = page.xpath(
                    '//div[@class="left_contents"]/p[contains(., "Contact")]/text()'
                )
                phone = contacts[0].split(':')[1]
                fax = None  # not every municipality lists a fax number
                if len(contacts) > 1:
                    fax = contacts[1].split(':')[1]
                email = self.get_email(page,
                                       '//div[@class="left_contents"]',
                                       error=False)

                url = page.xpath(
                    '//div[@class="left_contents"]//@href[not(contains(., "mailto:"))]'
                )
                if url:
                    url = url[0]

                groups = page.xpath(
                    '//div[contains(@class, "right_contents")]/p')
                assert len(groups), 'No groups found'
                for p in groups:
                    role = p.xpath('./b/text()')[0].rstrip('s')
                    if role not in expected_roles:
                        raise Exception('unexpected role: {}'.format(role))

                    councillors = p.xpath('./text()')
                    assert len(councillors), 'No councillors found'
                    for seat_number, name in enumerate(councillors, 1):
                        if 'vacant' in name.lower():
                            continue

                        if role in unique_roles:
                            district = division_name
                        else:
                            district = '{} (seat {})'.format(
                                division_name, seat_number)

                        organization.add_post(role=role,
                                              label=district,
                                              division_id=division_id)

                        p = Person(primary_org='government',
                                   primary_org_name=organization_name,
                                   name=name,
                                   district=district,
                                   role=role)
                        p.add_source(COUNCIL_PAGE)
                        p.add_source(list_link.attrib['href'])
                        p.add_source(detail_url)

                        if name in duplicate_names:
                            p.birth_date = str(birth_date)
                            birth_date += 1

                        p.add_contact('address', address, 'legislature')
                        # @see https://en.wikipedia.org/wiki/Area_code_506
                        if phone:
                            p.add_contact('voice',
                                          phone,
                                          'legislature',
                                          area_code=506)
                        if fax:
                            p.add_contact('fax',
                                          fax,
                                          'legislature',
                                          area_code=506)
                        if email:
                            p.add_contact('email', email)
                        if url:
                            p.add_link(url)

                        p._related[0].extras['boundary_url'] = (
                            '/boundaries/census-subdivisions/{}/'.format(
                                division_id.rsplit(':', 1)[1]))

                        yield p

                yield organization
Beispiel #43
0
    def scrape_mayor(self, url):
        page = self.lxmlize(url)
        text = page.xpath('//h1//text()[contains(., "Mayor")]')[0]
        if 'Acting Mayor' in text:
            # A councillor is acting mayor. We would need to add two roles to
            # the same person, which can be done with a little effort.
            return

        name = re.sub('(?:Acting )?Mayor ', '', text)

        email = self.get_email(page)
        phone = self.get_phone(page.xpath('//table[1]')[0])

        p = Person(primary_org='legislature', name=name, district='Côte-Saint-Luc', role='Maire')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = page.xpath('.//div[@class="content"]//img/@src')[0]
        p.add_contact('email', email)
        p.add_contact('voice', phone, 'legislature')
        return p
Beispiel #44
0
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE)

        # it's all javascript rendered on the client... wow.
        js = page.xpath(
            'string(//div[@class="inner_container"]/div/script[2])')  # allow string()
        districts = re.findall(r'arrayDistricts\[a.+"(.+)"', js)
        names = re.findall(r'arrayMembres\[a.+"(.+)"', js)
        urls = re.findall(r'arrayLiens\[a.+"(.+)"', js)
        # first item in list is mayor
        p = Person(primary_org='legislature',
                   name=names[0],
                   district='Gatineau',
                   role='Maire')
        p.add_source(COUNCIL_PAGE)
        p.add_source(MAYOR_CONTACT_PAGE)
        email = '*****@*****.**'  # hardcoded
        p.add_contact('email', email)
        yield p

        for raw_district, name, url in list(zip(districts, names, urls))[1:]:
            if name == 'Vacant':
                continue

            profile_url = COUNCIL_PAGE + '/' + url.split('/')[-1]
            profile_page = self.lxmlize(profile_url)
            photo_url = profile_page.xpath('//img/@src')[0]
            district = 'District ' + re.search(r'\d+', raw_district).group(0)
            email = self.get_email(profile_page)
            p = Person(primary_org='legislature',
                       name=name,
                       district=district,
                       role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.add_source(profile_url)
            p.image = photo_url
            p.add_contact('email', email)
            yield p
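
The Gatineau page embeds its data in client-side JavaScript, so the scraper pulls three parallel arrays (districts, names, profile links) out of the script text with regular expressions and zips them back together. A rough sketch of that extraction step against a made-up script snippet in the shape the regexes expect:

import re

# Hypothetical inline script; the arrayDistricts/arrayMembres/arrayLiens names
# come from the scraper's regexes, the contents are invented.
js = '''
arrayDistricts[arrayDistricts.length] = "District 1";
arrayMembres[arrayMembres.length] = "Jane Doe";
arrayLiens[arrayLiens.length] = "profil-jane-doe";
arrayDistricts[arrayDistricts.length] = "District 2";
arrayMembres[arrayMembres.length] = "John Roe";
arrayLiens[arrayLiens.length] = "profil-john-roe";
'''

districts = re.findall(r'arrayDistricts\[a.+"(.+)"', js)
names = re.findall(r'arrayMembres\[a.+"(.+)"', js)
urls = re.findall(r'arrayLiens\[a.+"(.+)"', js)

for district, name, url in zip(districts, names, urls):
    print(district, name, url)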
Beispiel #45
0
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE)
        table_data = page.xpath('//div[@id="litcontentDiv"]//tr')
        council_data = table_data[2:-1]

        mayor_row = table_data[0]

        photo_url_rel = mayor_row.xpath('string(.//img/@src)')  # can be empty
        photo_url = urljoin(COUNCIL_PAGE, photo_url_rel)
        contact_node = mayor_row.xpath('./td')[1]
        name = contact_node.xpath('.//font[1]/text()')[0]
        email = self.get_email(contact_node)

        p = Person(primary_org='legislature',
                   name=name,
                   district='Sault Ste. Marie',
                   role='Mayor')
        p.add_source(COUNCIL_PAGE)
        p.add_contact('email', email)
        p.image = photo_url
        yield p

        # rows alternate between a ward name and its councillors
        for ward_row, data_row in zip(*[iter(council_data)] * 2):
            district = ward_row.xpath('.//text()[contains(., "Ward")]')[0]
            district_num = district_name_using_number(district)
            for councillor_node in data_row.xpath('./td'):
                name = councillor_node.xpath(
                    './/strong/text()|.//font[1]/text()')[0]
                email = self.get_email(councillor_node)
                photo_url_rel = councillor_node.xpath(
                    'string(.//img/@src)')  # can be empty
                photo_url = urljoin(COUNCIL_PAGE, photo_url_rel)
                # address and phone are brittle, inconsistent

                p = Person(primary_org='legislature',
                           name=name,
                           district=district_num,
                           role='Councillor')
                p.add_source(COUNCIL_PAGE)
                if email:
                    p.add_contact('email', email)
                p.image = photo_url

                yield p
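
The `zip(*[iter(council_data)] * 2)` expression above walks the table two rows at a time: the same iterator object is passed twice, so `zip` advances it once for the ward-name row and once for the councillor row of each pair. A small sketch of the idiom on plain, made-up data:

rows = ['Ward 1', 'councillors for ward 1',
        'Ward 2', 'councillors for ward 2',
        'Ward 3', 'councillors for ward 3']

# One iterator, referenced twice: zip pulls both slots of each pair from it.
for ward_row, data_row in zip(*[iter(rows)] * 2):
    print(ward_row, '|', data_row)
# Ward 1 | councillors for ward 1
# Ward 2 | councillors for ward 2
# Ward 3 | councillors for ward 3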
Beispiel #46
0
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE)

        node = page.xpath('//div[@class="content-field"]/h3[contains(./text(), "Mayor")]/following-sibling::p[2]')[0]
        name = node.xpath('./strong/text()')[0]
        phone = node.xpath('./text()')[2].split(': ')[1]
        fax = node.xpath('./text()')[3].split(': ')[1]
        email = node.xpath('./a/text()')[0]
        image = node.xpath('./preceding::p//img/@src')[0]

        p = Person(primary_org='legislature', name=name, district='Belleville', role='Mayor')
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')
        p.add_contact('email', email)
        p.image = image

        yield p

        wards = page.xpath('//h3[contains(text(), "Councillors")]')
        assert len(wards), 'No councillors found'
        for ward in wards:
            ward_name = re.search(r'(Ward.+) Councillors', ward.text).group(1)
            councillors = ward.xpath('./following-sibling::div[1]//strong')
            for councillor in councillors:
                self.seat_numbers[ward_name] += 1
                district = '{} (seat {})'.format(ward_name, self.seat_numbers[ward_name])
                role = 'Councillor'

                name = councillor.text_content()
                phone = councillor.xpath('./following-sibling::text()[2]')[0].split(':')[1]
                email = councillor.xpath('./following-sibling::a//text()')[0]
                image = councillor.xpath('./preceding::img[1]/@src')[0]

                p = Person(primary_org='legislature', name=name, district=district, role=role)
                p.add_source(COUNCIL_PAGE)
                p.add_contact('voice', phone, 'legislature')
                p.add_contact('email', email)
                p.image = image

                yield p
Beispiel #47
0
    def scrape(self):
        response = urlopen(COUNCIL_PAGE).read()
        pdf = open('/tmp/yt.pdf', 'wb')  # the response body is bytes
        pdf.write(response)
        pdf.close()

        data = subprocess.check_output(
            ['pdftotext', '-layout', '/tmp/yt.pdf', '-']).decode('utf-8')
        data = re.split(r'\n\s*\n', data)
        for municipality in data:

            if 'Councillors' not in municipality:
                continue
            lines = municipality.split('\n')
            if 'Page' in lines[0]:
                lines.pop(0)
                if not lines[0].strip():
                    lines.pop(0)
            col1end = re.search(r'\s{2,}(\w)', lines[0].strip()).end()
            col2end = re.search(r':\s{2,}(\w)', lines[0].strip()).end()

            if 'Council' in lines[1]:
                address = lines[2][:col1end - 1].strip() + ' ' + lines[3][:col1end - 1].strip()
                district = lines[0][:col1end - 1].strip() + ' ' + lines[1][:col1end - 1].strip()
            else:
                address = lines[1][:col1end - 1].strip() + ' ' + lines[2][:col1end - 1].strip()
                district = lines[0][:col1end - 1].strip()

            organization = Organization(
                name=district + ' Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            organization.add_source(COUNCIL_PAGE)
            yield organization

            phone = re.findall(r'(?<=Phone: )\(?(\d{3}[\)-] ?\d{3}-\d{4})',
                               municipality)[0].replace(') ', '-')
            email = re.findall(r'(?<=E-mail:) (\S*)', municipality)[0]
            fax = None
            if 'Fax' in municipality:
                fax = re.findall(r'(?<=Fax: )\(?(\d{3}[\)-] ?\d{3}-\d{4})',
                                 municipality)[0].replace(') ', '-')
            website = None
            if 'Website' in municipality:
                website = re.findall(r'((http://|www\.)(\S*))',
                                     municipality)[0][0]

            councillor_or_mayor = False
            for line in lines:
                if 'Mayor:' in line:
                    councillor_or_mayor = True
                    role = 'Mayor'
                    continue
                if 'Councillors' in line:
                    councillor_or_mayor = True
                    role = 'Councillor'
                    continue
                if councillor_or_mayor:
                    councillor = line[col1end - 1:col2end - 1].strip()
                    if not councillor:
                        continue
                    p = Person(primary_org='legislature',
                               name=councillor,
                               district=district)
                    p.add_source(COUNCIL_PAGE)
                    membership = p.add_membership(organization,
                                                  role=role,
                                                  district=district)
                    membership.add_contact_detail('address', address,
                                                  'legislature')
                    membership.add_contact_detail('voice', phone,
                                                  'legislature')
                    membership.add_contact_detail('email', email)
                    if fax:
                        membership.add_contact_detail('fax', fax,
                                                      'legislature')
                    if website:
                        p.add_link(website)
                    yield p

        os.unlink('/tmp/yt.pdf')
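
This scraper shells out to `pdftotext` and parses its layout-preserving text dump. If the hard-coded `/tmp/yt.pdf` path is a concern, the same download-convert-cleanup flow can be written with `tempfile` so the scratch file is always removed; a minimal sketch, assuming poppler's `pdftotext` is on PATH (the `pdf_to_text` helper name here is purely illustrative):

import os
import subprocess
import tempfile
from urllib.request import urlopen

def pdf_to_text(url):
    """Download a PDF and return its layout-preserving text via pdftotext."""
    response = urlopen(url).read()  # bytes
    fd, path = tempfile.mkstemp(suffix='.pdf')
    try:
        with os.fdopen(fd, 'wb') as f:
            f.write(response)
        # '-' sends the extracted text to stdout; -layout keeps the column layout.
        return subprocess.check_output(['pdftotext', '-layout', path, '-']).decode('utf-8')
    finally:
        os.unlink(path)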
Beispiel #48
0
    def scrape(self):
        regional_councillor_seat_number = 1

        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//div[@id="WebPartWPQ3"]//ul[@class="dfwp-list"][1]/li/div/div/a')
        assert len(councillors), 'No councillors found'
        for councillor in councillors:
            url = councillor.attrib['href']
            page = self.lxmlize(url)

            title = page.xpath('//div[@class="PL_Title"]')[0].text_content()
            if "Councillor" in title:
                district, name = re.split(r'Councillor', title)
                role = 'Councillor'
                if "Regional" in district:
                    role = 'Regional Councillor'
                    district = "Vaughan (seat {})".format(regional_councillor_seat_number)
                    regional_councillor_seat_number += 1
            else:
                name = re.search(r'Mayor ([^,]+)', page.xpath('//meta[@name="keywords"]/@content')[0]).group(1)
                district = 'Vaughan'
                role = 'Mayor'
            name = name.strip()

            if role == 'Mayor':
                detail = self.lxmlize(page.xpath('//a[contains(@href,"/Contact-the-Mayor")]/@href')[0])
                contact_info = detail.xpath('//div[@id="ctl00_PlaceHolderMain_RichHtmlField1__ControlWrapper_RichHtmlField"]')[0]
            else:
                contact_node = page.xpath('//div[@id="WebPartWPQ2"][contains(., "Phone")]')
                if contact_node:
                    contact_info = contact_node[0]
                else:
                    contact_info = page.xpath('//div[@id="WebPartWPQ3"]')[0]

            phone = re.findall(r'[0-9]{3}-[0-9]{3}-[0-9]{4} ext\. [0-9]{4}', contact_info.text_content())[0].replace('ext. ', 'x')
            fax = re.findall(r'[0-9]{3}-[0-9]{3}-[0-9]{4}', contact_info.text_content())[1]
            email = self.get_email(contact_info)

            p = Person(primary_org='legislature', name=name, district=district.strip(), role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('fax', fax, 'legislature')
            p.add_contact('email', email)

            image = page.xpath('//img[contains(@alt, "Councillor")]/@src')
            if image:
                p.image = image[0]

            if page.xpath('.//a[contains(@href,"facebook")]'):
                p.add_link(page.xpath('.//a[contains(@href,"facebook")]')[0].attrib['href'])
            if page.xpath('.//a[contains(@href,"twitter")]'):
                p.add_link(page.xpath('.//a[contains(@href,"twitter")]')[0].attrib['href'])
            if page.xpath('.//a[contains(@href,"youtube")]'):
                p.add_link(page.xpath('.//a[contains(@href, "youtube")]')[0].attrib['href'])
            yield p
Beispiel #49
0
    def scrape(self):
        csv_text = self.get(self.get_csv_url()).text
        rows = [row for row in csv.DictReader(StringIO(csv_text))]
        assert len(rows), 'No members found'
        for mla in rows:
            name = '{} {} {}'.format(mla['MLA First Name'],
                                     mla['MLA Middle Names'],
                                     mla['MLA Last Name'])
            if name.strip() == '':
                continue
            party = get_party(mla['Caucus'])
            name_without_status = name.split(',')[0]
            detail_url = ('http://www.assembly.ab.ca/net/index.aspx?'
                          'p=mla_contact&rnumber={0}&leg=29'.format(
                              mla['Riding Number']))
            detail_page = self.lxmlize(detail_url)
            photo_url = detail_page.xpath('//img[@class="MemPhoto"]/@src')[0]
            p = Person(
                primary_org='legislature',
                name=name_without_status,
                district=mla['Riding Name'],
                role='MLA',
                party=party,
                image=photo_url,
            )
            p.add_source(COUNCIL_PAGE)
            p.add_source(detail_url)
            if mla['Email']:
                p.add_contact('email', mla['Email'])
            elif mla.get('MLA Email'):
                p.add_contact('email', mla['MLA Email'])
            if mla['Phone Number']:
                p.add_contact('voice', mla['Phone Number'], 'legislature')
            yield p
Beispiel #50
0
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE, 'utf-8')
        councillors = page.xpath('//div[@class="member-box member-box--gray"]')
        assert len(councillors), 'No councillors found'
        for councillor_elem in councillors:
            name = councillor_elem.xpath(
                './/div[@class="fiche__name"]/text()')[0]
            district = councillor_elem.xpath(
                './/div[@class="fiche__category"]/text()')[0]
            phone = councillor_elem.xpath(
                './/div[@class="fiche__social"]/span/text()')[0].split('T')[1]
            email_mailto = councillor_elem.xpath(
                './/div[@class="fiche__social"]/a[contains(@href, "mailto")]/@href'
            )
            photo_url = councillor_elem.xpath('.//img')[0].attrib['src']

            p = Person(primary_org='legislature',
                       name=name,
                       district=district,
                       role='Conseiller',
                       image=photo_url)
            p.add_source(COUNCIL_PAGE)
            p.add_contact('voice', phone, 'legislature')
            if email_mailto:
                email = email_mailto[0].split('mailto:')[1]
                p.add_contact('email', email)
            yield p

        mayor_elem = page.xpath(
            '//div[@class="member-box member-box--main"]')[0]
        name = mayor_elem.xpath('.//div[@class="fiche__name"]/text()')[0]
        phone = mayor_elem.xpath(
            './/div[@class="fiche__social"]/span/text()')[0].split('T')[1]
        email_mailto = mayor_elem.xpath(
            './/div[@class="fiche__social"]/a[contains(@href, "mailto")]/@href'
        )
        photo_url = mayor_elem.xpath('.//img')[0].attrib['src']
        p = Person(primary_org='legislature',
                   name=name,
                   district='Terrebonne',
                   role='Maire',
                   image=photo_url)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', phone, 'legislature')
        if email_mailto:
            email = email_mailto[0].split('mailto:')[1]
            p.add_contact('email', email)
        yield p
Beispiel #51
0
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE)
        members = page.xpath('//*[@id="ListeDeputes"]/tbody/tr')

        assert len(members), 'No members found'
        for row in members:
            name_comma, division = [cell.text_content() for cell in row[:2]]
            name = ' '.join(reversed(name_comma.strip().split(',')))
            party = row[2].text_content()
            email = self.get_email(row[3], error=False)
            detail_url = row[0][0].attrib['href']
            detail_page = self.lxmlize(detail_url)
            photo_url = detail_page.xpath('//img[@class="photoDepute"]/@src')[0]
            division = division.replace('–', '—')  # n-dash, m-dash
            p = Person(primary_org='legislature', name=name, district=division, role='MNA',
                       party=party, image=photo_url)
            p.add_source(COUNCIL_PAGE)
            p.add_source(detail_url)
            if email:
                p.add_contact('email', email)
            contact_url = detail_url.replace('index.html', 'coordonnees.html')
            contact_page = self.lxmlize(contact_url)
            p.add_source(contact_url, note='For telephone number(s)')
            for div in contact_page.xpath('//div[@class="blockAdresseDepute"]'):
                try:
                    phone = self.get_phone(div)
                    heading = div.find('h3').text
                except Exception:
                    pass  # probably just no phone number present
                else:
                    try:
                        note = {
                            'Circonscription': 'constituency',
                            'Parlement': 'legislature',
                            'Ministère': 'legislature',
                        }[heading]
                    except KeyError:
                        raise  # scraper should be updated to handle new value
                    else:
                        p.add_contact('voice', phone, note)
            yield p
Beispiel #52
0
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//table[@id="Main Content"]//td[@colspan="3"]//td/p/b')
        for councillor in councillors:
            district, name = councillor.xpath('./text()')[0].split(':')
            if 'Mayor' in district:
                yield scrape_mayor(councillor, name)
                continue

            p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)

            base_info = councillor.xpath('./parent::p/text()')
            for info in councillor.xpath('./parent::p/following-sibling::p'):
                if info.xpath('.//b'):
                    break
                base_info = base_info + info.xpath('./text()')

            address = ''
            complete = False
            while not complete:
                address = address + ' ' + base_info.pop(0)
                if re.search(r'[A-Z][0-9A-Z][A-Z] \d[A-Z]\d', address):
                    complete = True
            p.add_contact('address', address, 'legislature')

            base_info.pop(-1)
            base_info = ' '.join(base_info).split()
            for i, contact in enumerate(base_info):
                if re.match(r'[0-9]', contact):
                    continue
                if 'fax' in contact:
                    p.add_contact('fax', base_info[i + 1], 'legislature')
                else:
                    p.add_contact(contact, base_info[i + 1], contact)
            email = self.get_email(councillor, './parent::p/following-sibling::p')
            p.add_contact('email', email)
            yield p
Beispiel #53
0
    def scrape(self):
        root = self.lxmlize(COUNCIL_PAGE)
        everyone = root.xpath('//span[@class="Title"]')
        mayornode = everyone[0]
        mayor = {}
        spantext = ' '.join(mayornode.xpath('.//text()'))
        mayor['name'] = re.search(r'[^(]+', spantext).group(0).strip()
        mayor['photo_url'] = urljoin(COUNCIL_PAGE,
                                     mayornode.xpath('img/@src')[0])
        mayor['email'] = mayornode.xpath('following::a[1]/text()')[0]

        m = Person(primary_org='legislature',
                   name=mayor['name'],
                   district='Charlottetown',
                   role='Mayor')
        m.add_source(COUNCIL_PAGE)
        m.add_contact('email', mayor['email'])
        m.image = mayor['photo_url']

        yield m

        councillors = root.xpath('//span[@class="Title"]')[1:]
        assert len(councillors), 'No councillors found'
        for span in councillors:
            spantext = ' '.join(span.xpath('.//text()'))
            header = spantext.replace('\u2013', '-').replace('\x96',
                                                             '-').split('-')
            if len(header) != 2:
                continue

            name = header[0].strip()
            name = name.replace('Councillor', '')
            name = re.sub(r'\(.+?\)', '', name)
            name = ' '.join(name.split())

            district_id = ' '.join(header[1].split()[:2])

            # needed a wacky xpath to deal with ward 8
            photo = span.xpath('preceding::hr[1]/following::img[1]/@src')
            photo_url = urljoin(COUNCIL_PAGE, photo[0])

            email = span.xpath(
                'string(following::a[1]/text())')  # can be empty

            p = Person(primary_org='legislature',
                       name=name,
                       district=district_id,
                       role='Councillor')
            p.add_source(COUNCIL_PAGE)
            if email:
                p.add_contact('email', email)
            p.image = photo_url

            yield p
Beispiel #54
0
    def scrape(self):
        regional_councillor_seat_number = 1

        yield self.mayor_info(MAYOR_PAGE)

        page = self.lxmlize(COUNCIL_PAGE)
        councillors = page.xpath('//div[@id="news"]//p')
        for councillor in councillors:
            district = councillor.xpath('./b')[0].text_content()
            district = re.findall('(?:W|R).*', district)[0]
            role = 'Councillor'
            if 'Regional' in district:
                role = 'Regional Councillor'
                district = 'Cambridge (seat {})'.format(
                    regional_councillor_seat_number)
                regional_councillor_seat_number += 1
            name = councillor.xpath('.//a')[0].text_content()

            url = councillor.xpath('.//a')[0].attrib['href']
            page = self.lxmlize(url)

            image = page.xpath(
                '//img[contains(@src, "councilImages")]/@src')[0]
            address = page.xpath(
                '//*[contains(text(),"Address")]/ancestor::td'
            )[-1].text_content().split(':')[-1].replace('\t', '')
            phone = page.xpath(
                '//*[contains(text(),"Tel")]/ancestor::td'
            )[-1].text_content().split(':')[-1].replace('\t', '')
            phone = phone.replace('(', '').replace(') ', '-')
            fax = None  # some councillor pages have no fax number
            if page.xpath('//*[contains(text(),"Fax")]'):
                fax = page.xpath(
                    '//*[contains(text(),"Fax")]/ancestor::td'
                )[-1].text_content().split(':')[-1].replace('\t', '')
                fax = fax.replace('(', '').replace(') ', '-')
            email = self.get_email(page)

            p = Person(primary_org='legislature',
                       name=name,
                       district=district,
                       role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.add_contact('address', address, 'legislature')
            p.add_contact('voice', phone, 'legislature')
            if fax:
                p.add_contact('fax', fax, 'legislature')
            p.add_contact('email', email)
            p.image = image
            yield p
Beispiel #55
0
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath(
            '//div[@id="ctl00_ContentPlaceHolder1_ContentBlock1"]//a/parent::p'
        )
        for councillor in councillors:
            if not councillor.text_content().strip():
                continue
            if 'Mayor' in councillor.text_content():
                name = councillor.text_content().replace('Mayor ', '')
                district = 'Haldimand County'
                role = 'Mayor'
            else:
                district, name = councillor.text_content().split(' - ')
                name = name.replace('Councillor', '').strip()
                district = district.strip()
                role = 'Councillor'

            url = councillor.xpath('.//a')[0].attrib['href']
            page = self.lxmlize(url)

            p = Person(primary_org='legislature',
                       name=name,
                       district=district,
                       role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            p.image = page.xpath(
                '//div[@id="ctl00_ContentPlaceHolder1_ContentBlock1"]//tr[1]/td//img/@src'
            )[0]

            info = page.xpath(
                '//a[contains(@href, "mailto:")]/parent::*/text()')
            for i, field in enumerate(info):
                if re.match(r'[0-9]+ [A-Z]', field):
                    address = field + ', ' + info[i + 1] + ', ' + info[i + 2]
                    p.add_contact('address', address, 'legislature')
                if re.findall(r'[0-9]{3} [0-9]{3} [0-9]{4}', field):
                    if 'Fax' in field:
                        num = field.replace('Fax: ', '').strip().replace(' ', '-')
                        p.add_contact('fax', num, 'legislature')
                    else:
                        num = field.replace('Telephone: ', '').strip().replace(' ', '-')
                        p.add_contact('voice', num, 'legislature')
            email = self.get_email(page)
            p.add_contact('email', email)
            yield p
Beispiel #56
0
    def scrape(self):
        seat_numbers = defaultdict(int)

        page = self.lxmlize(COUNCIL_PAGE)

        mayor_url = page.xpath('//li[@id="pageid1075"]/div/a/@href')[0]
        yield self.scrape_mayor(mayor_url)

        wards = page.xpath('//div[@id="content"]//h3')
        for ward in wards:
            ward_name = ward.text_content()
            councillor_links = ward.xpath('./following-sibling::p[1]/a')

            assert len(councillor_links), 'No councillors found for ward {}'.format(ward_name)
            for councillor_link in councillor_links:
                name = councillor_link.text

                if ward_name in ('Ward 1', 'Ward 2'):
                    seat_numbers[ward_name] += 1
                    district = '{} (seat {})'.format(ward_name,
                                                     seat_numbers[ward_name])
                else:
                    district = ward_name

                p = Person(primary_org='legislature',
                           name=name,
                           district=district,
                           role='Councillor')
                url = councillor_link.attrib['href']
                p.add_source(COUNCIL_PAGE)
                p.add_source(url)
                cpage = self.lxmlize(url)
                image_url_rel = cpage.xpath(
                    '//div[@id="content"]//img[contains(@alt, "Councillor")]/@src'
                )[0]
                image_url = urljoin(url, image_url_rel)
                p.image = image_url

                contacts = cpage.xpath(
                    '//div[@id="content"]//div[@class="block"]/text()')
                for contact in contacts:
                    if not re.search(r'[0-9]', contact):
                        continue
                    if '(' not in contact:
                        contact_type = 'T'
                    else:
                        contact_type, contact = contact.split('(')
                    contact = contact.replace(') ', '-').strip()
                    if 'T' in contact_type:
                        p.add_contact('voice', contact, 'legislature')
                    if 'H' in contact_type:
                        p.add_contact('voice', contact, 'residence')
                    if 'C' in contact_type:
                        p.add_contact('cell', contact, 'legislature')
                    if 'F' in contact_type:
                        p.add_contact('fax', contact, 'legislature')
                email = self.get_email(
                    cpage, '//div[@id="content"]//div[@class="block"]')
                p.add_contact('email', email)
                yield p
Beispiel #57
0
    def scrape(self):
        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath(
            '//table[@cellpadding="4"]//td//a[text()!=""]/@href')
        for councillor in councillors:
            page = self.lxmlize(councillor)
            # Hon. is followed by Dr. in one case but the clean_name function
            # removes only one honorific title
            name = (page.xpath('//h2[contains(text(), "MLA:")]')[0]
                    .text_content()
                    .replace('MLA:', '')
                    .replace('Dr.', '')
                    .replace(', Q.C.', '')
                    .replace('Wm.', '')
                    .strip())
            district, party = cleanup_list(
                page.xpath(
                    '//h2/following-sibling::div[1]/div[2]/div[1]/div/text()'))
            p = Person(primary_org='legislature',
                       name=name,
                       district=district,
                       role='MLA',
                       party=party)
            p.add_source(COUNCIL_PAGE)
            p.add_source(councillor)

            p.image = page.xpath('//img[contains(@src, "Members")]/@src')[0]

            email = page.xpath(
                '//span[@class="convertToEmail"]//text()')[0].strip()
            if '#' in email:
                email = email.split('#')[0]
            if email:
                p.add_contact('email', email)

            office = ', '.join(
                cleanup_list(
                    page.xpath(
                        '//h4[contains(text(), "Office:")]/ancestor::div/text()'
                    )))
            office = re.sub(r'\s{2,}', ' ', office)
            p.add_contact('address', office, 'legislature')

            constituency = ', '.join(
                cleanup_list(
                    page.xpath(
                        '//h4[contains(text(), "Constituency:")]/ancestor::div[1]//text()'
                    )))
            constituency = re.sub(r'\s{2,}', ' ',
                                  constituency).split(', Phone')[0]
            p.add_contact('address', constituency, 'constituency')

            phones = cleanup_list(
                page.xpath(
                    '//span[contains(text(), "Phone:")]/following-sibling::text()'
                ))
            faxes = cleanup_list(
                page.xpath(
                    '//span[contains(text(), "Fax:")]/following-sibling::span[1]/text()'
                ))

            office_phone = phones[0]
            p.add_contact('voice', office_phone, 'legislature')
            if len(phones) > 1:
                constituency_phone = phones[1]
                p.add_contact('voice', constituency_phone, 'constituency')
            office_fax = faxes[0]
            p.add_contact('fax', office_fax, 'legislature')
            if len(faxes) > 1:
                constituency_fax = faxes[1]
                p.add_contact('fax', constituency_fax, 'constituency')

            yield p
Beispiel #58
0
    def scrape_mayor(self, url):
        page = self.lxmlize(url)
        name = page.xpath('//meta[@name="description"]/@content')[0].split(
            ',')[1]

        p = Person(primary_org='legislature',
                   name=name,
                   district='Moncton',
                   role='Mayor')
        p.add_source(url)

        p.image = page.xpath('//div[@id="content"]/p[1]/img/@src')[0]

        info = page.xpath('//table[@class="whiteroundedbox"]//tr[2]/td[1]')[1]
        address = ', '.join(info.xpath('./p[1]/text()')[1:4])
        address = re.sub(r'\s{2,}', ' ', address).strip()
        phone = info.xpath('.//p[2]/text()')[0].split(':')[1].strip()
        fax = info.xpath('.//p[2]/text()')[1].split(':')[1].strip()
        email = self.get_email(info)

        p.add_contact('address', address, 'legislature')
        if len(re.sub(r'\D', '', phone)) == 7:
            phone = '506-{}'.format(phone)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')
        p.add_contact('email', email)

        return p
Beispiel #59
0
    def scrape(self):
        # https://winnipeg.ca/council/wards/includes/wards.js
        # var COUNCIL_API = 'https://data.winnipeg.ca/resource/r4tk-7dip.json';
        api_url = 'https://data.winnipeg.ca/resource/r4tk-7dip.json'
        data = json.loads(requests.get(api_url).content)

        page = self.lxmlize(COUNCIL_PAGE, 'utf-8')

        councillors = page.xpath('//div[@class="box"]')
        assert len(councillors), 'No councillors found'
        for councillor in councillors:
            role = councillor.xpath(
                './/div[@class="insideboxtitle"]/text()')[0].strip()
            name = councillor.xpath('.//p[@class="insideboxtext"]/text()')[0]
            image = councillor.xpath('.//@src')[0]

            if 'Councillor' in name:
                role = 'Councillor'
                name = name.replace('Councillor ', '')

            url = api_url
            item = next(
                (item for item in data
                 if item['person'] == name and item['current_council']), None)
            if item is None:
                raise Exception(name)

            district = item['name_english'].replace(' - ',
                                                    '—')  # hyphen, m-dash

            email = item['email_link']
            voice = item['phone']
            fax = item['fax']

            p = Person(primary_org='legislature',
                       name=name,
                       district=district,
                       role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            if not image.endswith('nophoto.jpg'):
                p.image = image
            p.add_contact('email', parse_email(email))
            p.add_contact('voice', voice, 'legislature')
            p.add_contact('fax', fax, 'legislature')

            yield p
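
Beispiel #59 enriches the scraped council page with Winnipeg's open-data API: the JSON is downloaded once, then the matching record is found by councillor name, restricted to the current council. A small sketch of just that lookup step over invented records (the field names match the scraper's, the values are made up):

data = [
    {'person': 'Jane Doe', 'current_council': True,
     'name_english': 'St. Boniface', 'email_link': 'jane@example.org',
     'phone': '204-555-0001', 'fax': '204-555-0002'},
    {'person': 'Jane Doe', 'current_council': False,
     'name_english': 'Old Ward', 'email_link': '', 'phone': '', 'fax': ''},
]

name = 'Jane Doe'
# next() with a default of None lets the caller raise a clear error on a miss.
item = next((item for item in data
             if item['person'] == name and item['current_council']), None)
if item is None:
    raise Exception(name)
print(item['name_english'], item['phone'])  # St. Boniface 204-555-0001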
Beispiel #60
0
    def scrape(self):
        exclude_divisions = {}
        exclude_districts = {
            'Capital',
            'Capital F',
            'Capital G',
            'Capital H',
            'Central Coast B',
            'Central Okanagan East',
            'Central Okanagan West',
            'Comox Valley B',
            'Comox Valley C',
            'Islands Trust',
            'Kitimat-Stikine C',
            'Kootenay Boundary B',
            'Kootenay Boundary C',
            'Kootenay Boundary D',
            'Kootenay Boundary E',
            'Metro Vancouver A',
            'North Coast A',
            'North Coast C',
            'North Coast D',
            'North Coast E',
            'Okanagan-Similkameen I',
            'Okanagan-Similkameen Olalla Local Community Commission',
            'Qathet A',
            'Qathet B',
            'Qathet C',
            'Qathet D',
            'Qathet E',
        }
        expected_roles = {
            'candidate',
        }
        infixes = {
            'CY': 'City',
            'DM': 'District',
            'IGD': 'District',
            'IM': 'Municipal',
            'RGM': 'Regional',
            'T': 'Town',
            'VL': 'Village',
            'RDA': 'District',
        }
        duplicate_names = {
            'Rick Smith',
            'Sung Y Wong',
            'Elizabeth Taylor',
        }

        names_to_ids = {}
        for division in Division.get('ocd-division/country:ca').children(
                'csd'):
            type_id = division.id.rsplit(':', 1)[1]
            if type_id.startswith('59'):
                if division.attrs['classification'] == 'IRI':
                    continue
                if division.name in names_to_ids:
                    names_to_ids[division.name] = None
                else:
                    names_to_ids[division.name] = division.id

        reader = self.csv_reader(COUNCIL_PAGE, header=True)
        reader.fieldnames = [field.lower() for field in reader.fieldnames]

        organizations = {}

        birth_date = 1900
        seen = set()

        for row in reader:
            name = row['full name']
            district_name = row['district name']

            if not any(row.values()) or name.lower() in (
                    '', 'vacant') or district_name in exclude_districts:
                continue

            if row['district id']:
                division_id = 'ocd-division/country:ca/csd:{}'.format(
                    row['district id'])
            else:
                division_id = names_to_ids[row['district name']]

            if division_id in exclude_divisions:
                continue
            if not division_id:
                raise Exception('unhandled collision: {}'.format(
                    row['district name']))

            division = Division.get(division_id)

            division_name = division.name

            organization_name = '{} {} Council'.format(
                division_name, infixes[division.attrs['classification']])

            if division_id not in seen:
                seen.add(division_id)
                organizations[division_id] = Organization(
                    name=organization_name, classification='government')
                organizations[division_id].add_source(COUNCIL_PAGE)

            organization = organizations[division_id]

            role = row['primary role']
            if role not in expected_roles:
                raise Exception('unexpected role: {}'.format(role))
            if row['district id']:
                district = format(division_id)
            else:
                district = division_name

            organization.add_post(role=role,
                                  label=district,
                                  division_id=division_id)

            p = Person(primary_org='government',
                       primary_org_name=organization_name,
                       name=name,
                       district=district,
                       role=role)
            p.add_source(COUNCIL_PAGE)
            if row['source url']:
                p.add_source(row['source url'])

            if name in duplicate_names:
                p.birth_date = str(birth_date)
                birth_date += 1

            if row['email']:
                p.add_contact('email', row['email'])

            if row['phone']:
                p.add_contact('voice', row['phone'], 'legislature')

            if row['twitter']:
                p.add_link(row['twitter'])

            p._related[0].extras['boundary_url'] = (
                '/boundaries/census-subdivisions/{}/'.format(
                    division_id.rsplit(':', 1)[1]))

            yield p

        for organization in organizations.values():
            yield organization