Code Example #1
File: people.py  Project: fchagnon/scrapers-ca
    def get_people(self):
        page = lxmlize(COUNCIL_PAGE)
        councillor_links = page.xpath('//li[@id="pageid2117"]/ul/li/a')[2:10]
        for link in councillor_links:
            if not link.text.startswith('Councillor'):
                continue
            url = link.attrib['href']
            page = lxmlize(url)
            mail_link = page.xpath('//a[@title]')[0]
            name = mail_link.attrib['title']
            email = mail_link.attrib['href'][len('mailto:'):]
            photo_url = page.xpath(
                'string(//div[@class="pageContent"]//img[@align="right"]/@src)'
            )
            p = Legislator(name=name,
                           post_id='Abbotsford',
                           role='Councillor',
                           image=photo_url)
            p.add_source(url)
            p.add_contact('email', email, None)
            yield p

        page = lxmlize(MAYOR_URL)
        name = page.xpath('string(//h1)').split(' ', 1)[1]
        photo_url = page.xpath('string(//img[@hspace=10]/@src)')
        # email is hidden behind a form
        p = Legislator(name=name,
                       post_id='Abbotsford',
                       role='Mayor',
                       image=photo_url)
        p.add_source(MAYOR_URL)
        yield p
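
Note: every example on this page relies on a project-level lxmlize() helper (plus constants such as COUNCIL_PAGE and MAYOR_URL) that are defined elsewhere in scrapers-ca and not reproduced here. A minimal sketch of what such a helper might look like, assuming it fetches the page with requests, optionally forces an encoding, parses the HTML with lxml, and resolves relative links against the request URL; the real implementation may differ:

import requests
from lxml import html


def lxmlize(url, encoding=None):
    # Assumed behaviour: download, decode, parse, and absolutize links.
    response = requests.get(url)
    if encoding:
        response.encoding = encoding  # e.g. 'iso-8859-1' in the Halifax example
    page = html.fromstring(response.text)
    page.make_links_absolute(url)  # so attrib['href'] lookups below yield absolute URLs
    return page
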
Code Example #2
    def get_people(self):
        # mayor first, can't find email
        page = lxmlize(MAYOR_URL)
        photo_url = page.xpath('string(//img/@src[contains(., "Maire")])')
        name = page.xpath('string(//td[@class="contenu"]/text()[last()])')
        p = Legislator(name=name,
                       post_id=u"Trois-Rivières",
                       role="Maire",
                       image=photo_url)
        p.add_source(MAYOR_URL)
        yield p

        resp = requests.get(COUNCIL_PAGE)
        # page rendering through JS on the client
        page_re = re.compile(r'createItemNiv3.+"District (.+?)".+(index.+)\\"')
        for district, url_rel in page_re.findall(resp.text):
            if district not in ('des Estacades', 'des Plateaux',
                                'des Terrasses', 'du Sanctuaire'):
                district = re.sub(r'\A(?:de(?: la)?|des|du) ', '', district)

            url = urljoin(COUNCIL_PAGE, url_rel)
            page = lxmlize(url)
            name = page.xpath('string(//h2)')
            email = page.xpath(
                'string(//a/@href[contains(., "mailto:")])')[len('mailto:'):]
            photo_url = page.xpath(
                'string(//img/@src[contains(., "Conseiller")])')
            p = Legislator(name=name,
                           post_id=district,
                           role='Conseiller',
                           image=photo_url)
            p.add_source(url)
            p.add_contact('email', email, None)
            yield p
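
The Trois-Rivières council list is rendered client-side, so this scraper falls back to a regular expression over the raw JavaScript returned by requests.get(). Purely as an illustration of what page_re is matching, here is a hypothetical createItemNiv3 call of the expected shape (the markup on the live site may differ):

import re

page_re = re.compile(r'createItemNiv3.+"District (.+?)".+(index.+)\\"')
sample_js = r'createItemNiv3("menu", "District des Plateaux", "\"index.aspx?page=123\"");'
print(page_re.findall(sample_js))  # [('des Plateaux', 'index.aspx?page=123')]
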
Code Example #3
File: people.py  Project: fchagnon/scrapers-ca
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE, 'iso-8859-1')
    nodes = page.xpath('//table[@width="484"]//tr')
    try:
      for district_row, councillor_row, contact_row, _ in chunks(nodes, 4):
        post_id = district_row.xpath('string(.//strong)')
        name = councillor_row.xpath('string(.)')[len('Councillor '):]
        # TODO: phone numbers on site don't include area code. Add manually?
        #phone = contact_row.xpath('string(td[2]/text())')
        email = contact_row.xpath('string(td[4]/a)').replace('[at]', '@')

        p = Legislator(name=name, post_id=post_id, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        #p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email, None)
        yield p
    except ValueError:
      # on the last run through, there will be less than 4 rows to unpack
      pass

    mayor_page = lxmlize(MAYOR_PAGE, 'iso-8859-1')
    name = mayor_page.xpath('string(//h1[contains(., "Bio")])')[:-len(' Bio')]
    contact_page = lxmlize(MAYOR_CONTACT_URL, 'iso-8859-1')
    email = contact_page.xpath('string(//a[contains(., "@")][1])')

    p = Legislator(name=name, post_id='Halifax', role='Mayor')  # this block scrapes the mayor
    p.add_source(MAYOR_PAGE)
    p.add_source(MAYOR_CONTACT_URL)
    p.add_contact('email', email, None)
    yield p
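
The Halifax example above walks the table rows four at a time through a chunks() utility defined elsewhere in the project. A plausible minimal version, assuming it simply slices a sequence into fixed-size groups and lets a short trailing group raise ValueError when unpacked (which is exactly what the try/except ValueError above tolerates):

def chunks(sequence, size):
    # Assumed helper: yield successive `size`-item slices; the last slice may
    # be shorter, so unpacking it into four names raises ValueError upstream.
    for start in range(0, len(sequence), size):
        yield sequence[start:start + size]

# e.g. list(chunks([1, 2, 3, 4, 5], 4)) -> [[1, 2, 3, 4], [5]]
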
Code Example #4
File: people.py  Project: fchagnon/scrapers-ca
  def get_people(self):

    page = lxmlize(COUNCIL_PAGE)

    councillors = page.xpath('//p[@class="WSIndent"]/a')
    for councillor in councillors:
      district = re.findall(r'(Ward [0-9]{1,2})', councillor.text_content())
      if district:
        district = district[0]
        name = councillor.text_content().replace(district, '').strip()
        role = 'Councillor'
      else:
        district = 'Kawartha Lakes'
        name = councillor.text_content().replace('Mayor', '').strip()
        role = 'Mayor'

      url = councillor.attrib['href']
      page = lxmlize(url)
      email = page.xpath('//a[contains(@href, "mailto:")]/@href')[0].rsplit(':', 1)[1].strip()
      image = page.xpath('//img[@class="image-right"]/@src')[0]

      p = Legislator(name=name, post_id=district, role=role)
      p.add_source(COUNCIL_PAGE)
      p.add_source(url)
      p.add_contact('email', email, None)
      p.image = image
      yield p
Code Example #5
File: people.py  Project: fchagnon/scrapers-ca
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE)

    councillors = page.xpath('//div[@id="subnav"]//a')
    for councillor in councillors:
      name = councillor.xpath('./span/text()')[0].strip()
      district = councillor.xpath('.//strong')[0].text_content()

      url = councillor.attrib['href']

      if councillor == councillors[0]:
        yield self.scrape_mayor(name, url)
        continue

      page = lxmlize(url)

      address = page.xpath('//div[@id="content"]//p[contains(text(),"City of Burlington,")]')
      contact = page.xpath('//div[@id="subnav"]//p[contains(text(),"Phone")]')[0]
      phone = re.findall(r'Phone: (.*)', contact.text_content())[0].replace('Ext. ', 'x').replace('#', 'x')
      fax = re.findall(r'Fax: (.*)', contact.text_content())[0]
      email = contact.xpath('//a[contains(@href, "mailto:")]')[0].text_content()

      p = Legislator(name=name, post_id=district, role='Councillor')
      p.add_source(COUNCIL_PAGE)
      p.add_source(url)

      p.image = page.xpath('//div[@id="subnav"]//img/@src')[0]

      if address:
        p.add_contact('address', address[0].text_content(), 'legislature')
      p.add_contact('voice', phone, 'legislature')
      p.add_contact('fax', fax, 'legislature')
      p.add_contact('email', email, None)

      yield p
Code Example #6
    def get_people(self):

        page = lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//p[@class="WSIndent"]/a')
        for councillor in councillors:
            district = re.findall(r'(Ward [0-9]{1,2})',
                                  councillor.text_content())
            if district:
                district = district[0]
                name = councillor.text_content().replace(district, '').strip()
                role = 'Councillor'
            else:
                district = 'Kawartha Lakes'
                name = councillor.text_content().replace('Mayor', '').strip()
                role = 'Mayor'

            url = councillor.attrib['href']
            page = lxmlize(url)
            email = page.xpath(
                '//a[contains(@href, "mailto:")]/@href')[0].rsplit(
                    ':', 1)[1].strip()
            image = page.xpath('//img[@class="image-right"]/@src')[0]

            p = Legislator(name=name, post_id=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.add_contact('email', email, None)
            p.image = image
            yield p
Code Example #7
File: people.py  Project: fchagnon/scrapers-ca
  def scrape_mayor(self, name, url):
    page = lxmlize(url)

    contact = page.xpath('//div[@id="secondary align_RightSideBar"]/blockquote/p/text()')
    phone = contact[0]
    fax = contact[1]
    email = page.xpath('//div[@id="secondary align_RightSideBar"]/blockquote/p/a[contains(@href, "mailto:")]/text()')[0]

    mayor_page = lxmlize('http://www.burlingtonmayor.com')
    contact_url = mayor_page.xpath('//div[@class="menu"]//a[contains(text(),"Contact")]')[0].attrib['href']
    mayor_page = lxmlize(contact_url)
    address = mayor_page.xpath('//div[@class="entry-content"]//p[contains(text(),"City Hall")]')[0].text_content()

    p = Legislator(name=name, post_id="Burlington", role='Mayor')
    p.add_source(COUNCIL_PAGE)
    p.add_source(url)
    p.add_source('http://www.burlingtonmayor.com')

    p.image = page.xpath('//div[@id="secondary align_RightSideBar"]/p/img/@src')[0]
    p.add_contact('voice', phone, 'legislature')
    p.add_contact('fax', fax, 'legislature')
    p.add_contact('email', email, None)
    p.add_contact('address', address, 'legislature')

    return p
Code Example #8
File: people.py  Project: fchagnon/scrapers-ca
    def scrape_mayor(self, url):
        page = lxmlize(url)
        name = page.xpath("//h1/text()")[0].replace("Toronto Mayor",
                                                    "").strip()

        p = Legislator(name, post_id="Toronto", role='Mayor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)

        p.image = page.xpath('string(//article/img/@src)')

        url = page.xpath(
            '//a[contains(text(), "Contact the Mayor")]')[0].attrib['href']
        url = url.replace(
            'www.', 'www1.'
        )  # @todo fix lxmlize to use the redirected URL to make links absolute
        p.add_source(url)
        page = lxmlize(url)

        mail_elem, phone_elem = page.xpath('//h3')[:2]
        address = ''.join(mail_elem.xpath('./following-sibling::p//text()'))
        phone = phone_elem.xpath('string(./following-sibling::p[1])')

        p.add_contact('address', address, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        return p
Code Example #9
File: people.py  Project: fchagnon/scrapers-ca
    def get_people(self):
        member_parties = dict(process_parties(lxmlize(PARTY_PAGE)))

        page = lxmlize(COUNCIL_PAGE)
        for row in page.xpath('//table[not(@id="footer")]/tr')[1:]:
            name, district, _, email = [
                cell.xpath('string(.)').replace(u'\xa0', u' ') for cell in row
            ]
            phone = row[2].xpath('string(text()[1])')
            try:
                photo_page_url = row[0].xpath('./a/@href')[0]
            except IndexError:
                continue  # there is a vacant district
            photo_page = lxmlize(photo_page_url)
            photo_url = photo_page.xpath('string(//table//img/@src)')
            district = district.replace(' - ', u'—')  # m-dash
            party = get_party(member_parties[name.strip()])
            p = Legislator(name=name,
                           post_id=district,
                           role='MHA',
                           party=party,
                           image=photo_url)
            p.add_source(COUNCIL_PAGE)
            p.add_source(photo_page_url)
            p.add_contact('email', email, None)
            # TODO: either fix phone regex or tweak phone value
            p.add_contact('voice', phone, 'legislature')
            yield p
Code Example #10
File: people.py  Project: fchagnon/scrapers-ca
  def get_people(self):
    yield mayor_info(MAYOR_PAGE)

    page = lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[@id="news"]//p')
    for councillor in councillors:
      district = councillor.xpath('./b')[0].text_content()
      district = re.findall(u'(?:W|R).*', district)[0]
      role = 'Councillor'
      if 'Regional' in district:
        district = 'Cambridge'
        role = 'Regional Councillor'
      name = councillor.xpath('.//a')[0].text_content()

      url = councillor.xpath('.//a')[0].attrib['href']
      page = lxmlize(url)

      image = page.xpath('//img[contains(@src, "councilImages")]/@src')[0]
      address = page.xpath('//*[contains(text(),"Address")]/ancestor::td')[-1].text_content().split(':')[-1].replace("\t", '')
      phone = page.xpath('//*[contains(text(),"Tel")]/ancestor::td')[-1].text_content().split(':')[-1].replace("\t", '')
      phone = phone.replace('(', '').replace(') ', '-')
      fax = None  # not every councillor page lists a fax number
      if page.xpath('//*[contains(text(),"Fax")]'):
        fax = page.xpath('//*[contains(text(),"Fax")]/ancestor::td')[-1].text_content().split(':')[-1].replace("\t", '')
        fax = fax.replace('(', '').replace(') ', '-')
      email = page.xpath('//a[contains(@href,"mailto:")]')[0].text_content()

      p = Legislator(name=name, post_id=district, role=role)
      p.add_source(COUNCIL_PAGE)
      p.add_source(url)
      p.add_contact('address', address, 'legislature')
      p.add_contact('voice', phone, 'legislature')
      if fax:
        p.add_contact('fax', fax, 'legislature')
      p.add_contact('email', email, None)
      p.image = image
      yield p
Code Example #11
    def get_people(self):
        page = lxmlize(COUNCIL_PAGE)

        # it's all javascript rendered on the client... wow.
        js = page.xpath(
            'string(//div[@class="inner_container"]/div/script[2])')
        districts = re.findall(r'arrayDistricts\[a.+"(.+)"', js)
        members = re.findall(r'arrayMembres\[a.+"(.+)"', js)
        urls = re.findall(r'arrayLiens\[a.+"(.+)"', js)
        # first item in list is mayor
        p = Legislator(name=members[0], post_id='Gatineau', role='Maire')
        p.add_source(COUNCIL_PAGE)
        mayor_page = lxmlize(MAYOR_CONTACT_PAGE)
        p.add_source(MAYOR_CONTACT_PAGE)
        email = '*****@*****.**'  # hardcoded
        p.add_contact('email', email, None)
        yield p

        # list() is needed so the zip object can be sliced on Python 3
        for district, member, url in list(zip(districts, members, urls))[1:]:
            profile_url = COUNCIL_PAGE + '/' + url.split('/')[-1]
            profile_page = lxmlize(profile_url)
            photo_url = profile_page.xpath('string(//img/@src)')
            post_id = 'District ' + re.search(r'\d+', district).group(0)
            email = profile_page.xpath(
                'string(//a[contains(@href, "mailto:")]/@href)')[len('mailto:'
                                                                     ):]
            p = Legislator(name=member, post_id=post_id, role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.add_source(profile_url)
            p.image = photo_url
            p.add_contact('email', email, None)
            yield p
Code Example #12
def scrape_mayor(url):
    page = lxmlize(url)
    name = page.xpath('//tr/td/p')[-1]
    name = name.text_content().replace('Mayor', '')
    image = page.xpath('//div[@class="sask_ArticleBody"]//img/@src')[0]

    contact_url = page.xpath(
        '//a[contains(text(), "Contact the Mayor")]/@href')[0]
    page = lxmlize(contact_url)

    address = ' '.join(
        page.xpath(
            '//div[@id="ctl00_PlaceHolderMain_RichHtmlField1__ControlWrapper_RichHtmlField"]/p[4]/text()'
        )[1:])
    phone = page.xpath(
        '//div[@id="ctl00_PlaceHolderMain_RichHtmlField1__ControlWrapper_RichHtmlField"]/p[5]/span/text()'
    )[0].replace('(', '').replace(') ', '-')
    fax = page.xpath(
        '//div[@id="ctl00_PlaceHolderMain_RichHtmlField1__ControlWrapper_RichHtmlField"]/p[6]/span/text()'
    )[0].replace('(', '').replace(') ', '-')

    p = Legislator(name=name, post_id='Saskatoon', role='Mayor')
    p.add_source(url)
    p.image = image
    p.add_contact('address', address, 'legislature')
    p.add_contact('voice', phone, 'legislature')
    p.add_contact('fax', fax, 'legislature')
    return p
Code Example #13
File: people.py  Project: fchagnon/scrapers-ca
    def scrape_mayor(self, name, url):
        page = lxmlize(url)

        contact = page.xpath(
            '//div[@id="secondary align_RightSideBar"]/blockquote/p/text()')
        phone = contact[0]
        fax = contact[1]
        email = page.xpath(
            '//div[@id="secondary align_RightSideBar"]/blockquote/p/a[contains(@href, "mailto:")]/text()'
        )[0]

        mayor_page = lxmlize('http://www.burlingtonmayor.com')
        contact_url = mayor_page.xpath(
            '//div[@class="menu"]//a[contains(text(),"Contact")]'
        )[0].attrib['href']
        mayor_page = lxmlize(contact_url)
        address = mayor_page.xpath(
            '//div[@class="entry-content"]//p[contains(text(),"City Hall")]'
        )[0].text_content()

        p = Legislator(name=name, post_id="Burlington", role='Mayor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_source('http://www.burlingtonmayor.com')

        p.image = page.xpath(
            '//div[@id="secondary align_RightSideBar"]/p/img/@src')[0]
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')
        p.add_contact('email', email, None)
        p.add_contact('address', address, 'legislature')

        return p
Code Example #14
File: people.py  Project: fchagnon/scrapers-ca
  def scrape_mayor(self, div):
    url = div.attrib['href']
    page = lxmlize(url)

    name = div.text_content().replace('Mayor ', '')
    contact_url = page.xpath('//ul[@class="navSecondary"]//a[contains(text(),"Contact")]')[0].attrib['href']
    page = lxmlize(contact_url)

    contact_div = page.xpath('//div[@class="col"][2]')[0]

    address = contact_div.xpath('.//p[1]')[0].text_content()
    address = re.findall(r'(City of Greater .*)', address, flags=re.DOTALL)[0]
    phone = contact_div.xpath('.//p[2]')[0].text_content()
    phone = phone.replace('Phone: ', '')
    fax = contact_div.xpath('.//p[3]')[0].text_content()
    fax = fax.split(' ')[-1]
    email = contact_div.xpath('//a[contains(@href, "mailto:")]')[0].text_content()

    p = Legislator(name=name, post_id='Greater Sudbury', role='Mayor')
    p.add_source(COUNCIL_PAGE)
    p.add_source(contact_url)
    p.add_contact('address', address, 'legislature')
    p.add_contact('voice', phone, 'legislature')
    p.add_contact('fax', fax, 'legislature')
    p.add_contact('email', email, None)
    return p
Code Example #15
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE)

    mayor_url = page.xpath('//a[contains(text(), "Mayor")]/@href')[0]
    yield self.scrape_mayor(mayor_url)

    councillors_url = page.xpath('//a[contains(text(), "Councillors")]/@href')[0]
    cpage = lxmlize(councillors_url)

    councillor_rows = cpage.xpath('//tr[td//img]')[:-1]
    for councillor_row in councillor_rows:
      img_cell, info_cell = tuple(councillor_row)
      name = info_cell.xpath(
         'string(.//span[contains(text(), "Councillor")])')[len('Councillor '):]
      district = info_cell.xpath('string(.//p[contains(text(), "District")])')
      email = info_cell.xpath('string(.//a[contains(@href, "mailto:")])')
      if not email:
        email = info_cell.xpath('string(.//strong[contains(text(), "E-mail")]/following-sibling::text())')
      phone = info_cell.xpath(
          'string(.//p[contains(.//text(), "Telephone:")])').split(':')[1]
      img_url_rel = img_cell.xpath('string(.//img/@src)')  # the photo path lives in @src of this row's image
      img_url = urljoin(councillors_url, img_url_rel)

      p = Legislator(name=name, post_id=district, role='Conseiller')
      p.add_source(COUNCIL_PAGE)
      p.add_source(councillors_url)
      p.add_contact('email', email, None)
      p.add_contact('voice', phone, 'legislature')
      p.image = img_url
      yield p
Code Example #16
    def get_people(self):
        page = lxmlize(COUNCIL_PAGE, 'iso-8859-1')
        nodes = page.xpath('//table[@width="484"]//tr')
        try:
            for district_row, councillor_row, contact_row, _ in chunks(
                    nodes, 4):
                post_id = district_row.xpath('string(.//strong)')
                name = councillor_row.xpath('string(.)')[len('Councillor '):]
                # TODO: phone numbers on site don't include area code. Add manually?
                #phone = contact_row.xpath('string(td[2]/text())')
                email = contact_row.xpath('string(td[4]/a)').replace(
                    '[at]', '@')

                p = Legislator(name=name, post_id=post_id, role='Councillor')
                p.add_source(COUNCIL_PAGE)
                #p.add_contact('voice', phone, 'legislature')
                p.add_contact('email', email, None)
                yield p
        except ValueError:
            # on the last run through, there will be less than 4 rows to unpack
            pass

        mayor_page = lxmlize(MAYOR_PAGE, 'iso-8859-1')
        name = mayor_page.xpath(
            'string(//h1[contains(., "Bio")])')[:-len(' Bio')]
        contact_page = lxmlize(MAYOR_CONTACT_URL, 'iso-8859-1')
        email = contact_page.xpath('string(//a[contains(., "@")][1])')

        p = Legislator(name=name, post_id='Halifax', role='Mayor')  # this block scrapes the mayor
        p.add_source(MAYOR_PAGE)
        p.add_source(MAYOR_CONTACT_URL)
        p.add_contact('email', email, None)
        yield p
Code Example #17
    def scrape_mayor(self, div):
        url = div.attrib['href']
        page = lxmlize(url)

        name = div.text_content().replace('Mayor ', '')
        contact_url = page.xpath(
            '//ul[@class="navSecondary"]//a[contains(text(),"Contact")]'
        )[0].attrib['href']
        page = lxmlize(contact_url)

        contact_div = page.xpath('//div[@class="col"][2]')[0]

        address = contact_div.xpath('.//p[1]')[0].text_content()
        address = re.findall(r'(City of Greater .*)', address,
                             flags=re.DOTALL)[0]
        phone = contact_div.xpath('.//p[2]')[0].text_content()
        phone = phone.replace('Phone: ', '')
        fax = contact_div.xpath('.//p[3]')[0].text_content()
        fax = fax.split(' ')[-1]
        email = contact_div.xpath(
            '//a[contains(@href, "mailto:")]')[0].text_content()

        p = Legislator(name=name, post_id='Greater Sudbury', role='Mayor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(contact_url)
        p.add_contact('address', address, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')
        p.add_contact('email', email, None)
        return p
Code Example #18
File: people.py  Project: fchagnon/scrapers-ca
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE)

    councillors = page.xpath('//div[@class="article-content"]//td[@class="ms-rteTableOddCol-0"]')
    yield scrape_mayor(councillors[0])
    for councillor in councillors[1:]:
      if not councillor.xpath('.//a'):
        continue

      name = councillor.xpath('.//a')[0].text_content().strip()
      district = councillor.xpath('.//a')[1].text_content()
      url = councillor.xpath('.//a/@href')[0]
      page = lxmlize(url)

      p = Legislator(name=name, post_id=district, role='Conseiller')
      p.add_source(COUNCIL_PAGE)
      p.add_source(url)

      p.image = councillor.xpath('./preceding-sibling::td//img/@src')[-1]

      contacts = page.xpath('.//td[@class="ms-rteTableOddCol-0"]//text()')
      for contact in contacts:
        if re.findall(r'[0-9]', contact):
          phone = contact.strip().replace(' ', '-')
          p.add_contact('voice', phone, 'legislature')
      get_links(p, page.xpath('.//td[@class="ms-rteTableOddCol-0"]')[0])

      email = page.xpath(
        'string(//a[contains(@href, "mailto:")]/@href)')[len('mailto:'):]
      p.add_contact('email', email, None)
      yield p
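
This example (and #37 below) hands the profile page off to a shared get_links() helper for websites and social-media accounts. That helper is not shown on this page; a rough sketch of what it presumably does, assuming the add_link(url, note) method used in the other examples:

def get_links(person, element):
    # Assumed helper: record every non-mailto anchor inside `element` as a link.
    for href in element.xpath('.//a/@href'):
        if href.startswith('mailto:'):
            continue  # email addresses are added separately via add_contact()
        person.add_link(href, None)
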
Code Example #19
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE)

    councillors = page.xpath('//div[@id="c2087"]//a')
    for councillor in councillors:
      name = councillor.text_content()
      url = councillor.attrib['href']
      page = lxmlize(url)
      if 'Maire' in page.xpath('//h2/text()')[0]:
        district = 'Sherbrooke'
        role = 'Maire'
      else:
        district = page.xpath('//div[@class="csc-default"]//a[@target="_blank"]/text()')[0].replace('district', '').replace('Domaine Howard', 'Domaine-Howard').strip()
        role = 'Conseiller'
      if district in ('de Brompton', 'de Lennoxville'):
        district = district.replace('de ', '')
      p = Legislator(name=name, post_id=district, role=role)
      p.add_source(COUNCIL_PAGE)
      p.add_source(url)
      p.image = page.xpath('//div[@class="csc-textpic-image csc-textpic-last"]//img/@src')[0]
      parts = page.xpath('//li[contains(text(), "phone")]/text()')[0].split(':')
      note = parts[0]
      phone = parts[1]
      p.add_contact(note, phone, note)
      email = page.xpath('//a[contains(@href, "mailto:")]/@href')
      if email:
        email = email[0].split(':')[1]
        p.add_contact('email', email, None)
      if district == 'Brompton':
        p.add_extra('boundary_url', '/boundaries/sherbrooke-boroughs/brompton/')
      elif district == 'Lennoxville':
        p.add_extra('boundary_url', '/boundaries/sherbrooke-boroughs/lennoxville/')
      yield p
Code Example #20
File: people.py  Project: rhymeswithcycle/scrapers-ca
    def get_people(self):
        # mayor first, can't find email
        page = lxmlize(MAYOR_URL)
        photo_url = page.xpath('string(//img/@src[contains(., "Maire")])')
        name = page.xpath('string(//td[@class="contenu"]/text()[last()])')
        p = Legislator(name=name, post_id=u"Trois-Rivières", role="Maire", image=photo_url)
        p.add_source(MAYOR_URL)
        yield p

        resp = requests.get(COUNCIL_PAGE)
        # page rendering through JS on the client
        page_re = re.compile(r'createItemNiv3.+"District (.+?)".+(index.+)\\"')
        for district, url_rel in page_re.findall(resp.text):
            if district not in ("des Estacades", "des Plateaux", "des Terrasses", "du Sanctuaire"):
                district = re.sub("\A(?:de(?: la)?|des|du) ", "", district)

            url = urljoin(COUNCIL_PAGE, url_rel)
            page = lxmlize(url)
            name = page.xpath("string(//h2)")
            email = page.xpath('string(//a/@href[contains(., "mailto:")])')[len("mailto:") :]
            photo_url = page.xpath('string(//img/@src[contains(., "Conseiller")])')
            p = Legislator(name=name, post_id=district, role="Conseiller", image=photo_url)
            p.add_source(url)
            p.add_contact("email", email, None)
            yield p
Code Example #21
File: people.py  Project: fchagnon/scrapers-ca
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE)

    councillors = page.xpath('//ul[@class="subNav top"]/li/ul//li/a')
    for councillor in councillors:
      name = councillor.text_content()

      url = councillor.attrib['href']
      page = lxmlize(url)

      if councillor == councillors[0]:
        district = 'Ajax'
        role = 'Mayor'
      else:
        district = re.findall(r'Ward.*', page.xpath('//div[@id="printAreaContent"]//h1')[0].text_content())[0].strip()
        role = page.xpath('//div[@id="printAreaContent"]//h1')[0].text_content()
        role = re.findall('((Regional)? ?(Councillor))', role)[0][0]

      p = Legislator(name=name, post_id=district, role=role)
      p.add_source(COUNCIL_PAGE)
      p.add_source(url)

      p.image = page.xpath('//div[@class="intQuicklinksPhoto"]/img/@src')[0]

      contact_info = page.xpath('//table[@class="datatable"][1]//tr')[1:]
      for line in contact_info:
        contact_type = line.xpath('./td')[0].text_content().strip()
        contact = line.xpath('./td')[1].text_content().strip()
        if re.match(r'(Phone)|(Fax)|(Email)', contact_type):
          contact_type = CONTACT_DETAIL_TYPE_MAP[contact_type]
          p.add_contact(contact_type, contact, None if contact_type == 'email' else 'legislature')
        else:
          p.add_link(contact, None)
      yield p
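
The Ajax scraper above converts the table's human-readable labels into contact-detail types through a CONTACT_DETAIL_TYPE_MAP constant defined elsewhere. Inferring from the add_contact() calls in the other examples, it presumably looks roughly like this (the real mapping may contain additional keys):

CONTACT_DETAIL_TYPE_MAP = {
    'Phone': 'voice',
    'Fax': 'fax',
    'Email': 'email',
}
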
Code Example #22
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE)

    member_cells = page.xpath(
        '//div[@class="views-field views-field-field-picture"]/'
        'parent::td')
    for cell in member_cells:
      name = cell[1].text_content().replace(' .', '. ') # typo on page
      riding = cell[2].text_content()
      if 'Mackenzie Delta' in riding:
        riding = 'Mackenzie-Delta'
      detail_url = cell[0].xpath('string(.//a/@href)')
      detail_page = lxmlize(detail_url)
      photo_url = detail_page.xpath(
          'string(//div[@class="field-item even"]/img/@src)')
      email = detail_page.xpath('string(//a[contains(@href, "mailto:")])')

      contact_text = detail_page.xpath(
          'string(//div[@property="content:encoded"]/p[1])')
      phone = re.search(r'P(hone)?: ([-0-9]+)', contact_text).group(2)

      p = Legislator(name=name, post_id=riding, role='MLA', image=photo_url)
      p.add_source(COUNCIL_PAGE)
      p.add_source(detail_url)
      p.add_contact('email', email, None)
      p.add_contact('voice', phone, 'legislature')
      yield p
Code Example #23
File: people.py  Project: fchagnon/scrapers-ca
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE)

    councillors = page.xpath('//div[@id="navMultilevel"]//a')
    for councillor in councillors:
      if councillor == councillors[0]:
        yield self.scrape_mayor(councillor)
        continue

      if '-' not in councillor.text_content():
        break

      district, name = councillor.text_content().split(' - ')
      if name == 'Vacant':
        continue

      page = lxmlize(councillor.attrib['href'])

      address = page.xpath('//div[@class="column last"]//p')[0].text_content()
      phone = page.xpath('//article[@id="primary"]//*[contains(text(),"Tel")]')[0].text_content()
      phone = re.findall(r'([0-9].*)', phone)[0].replace(') ', '-')
      fax = page.xpath('//article[@id="primary"]//*[contains(text(),"Fax")]')[0].text_content()
      fax = re.findall(r'([0-9].*)', fax)[0].replace(') ', '-')
      email = page.xpath('//a[contains(@href, "mailto:")]')[0].text_content()

      p = Legislator(name=name, post_id=district, role='Councillor')
      p.add_source(COUNCIL_PAGE)
      p.add_source(councillor.attrib['href'])
      p.add_contact('address', address, 'legislature')
      p.add_contact('voice', phone, 'legislature')
      p.add_contact('fax', fax, 'legislature')
      p.add_contact('email', email, None)
      p.image = page.xpath('//article[@id="primary"]//img/@src')[1]
      yield p
Code Example #24
File: people.py  Project: fchagnon/scrapers-ca
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE)
    councillor_links = page.xpath('//li[@id="pageid2117"]/ul/li/a')[2:10]
    for link in councillor_links:
      if not link.text.startswith('Councillor'):
        continue
      url = link.attrib['href']
      page = lxmlize(url)
      mail_link = page.xpath('//a[@title]')[0]
      name = mail_link.attrib['title']
      email = mail_link.attrib['href'][len('mailto:'):]
      photo_url = page.xpath('string(//div[@class="pageContent"]//img[@align="right"]/@src)')
      p = Legislator(name=name, post_id='Abbotsford', role='Councillor',
                     image=photo_url)
      p.add_source(url)
      p.add_contact('email', email, None)
      yield p

    page = lxmlize(MAYOR_URL)
    name = page.xpath('string(//h1)').split(' ', 1)[1]
    photo_url = page.xpath('string(//img[@hspace=10]/@src)')
    # email is hidden behind a form
    p = Legislator(name=name, post_id='Abbotsford', role='Mayor', image=photo_url)
    p.add_source(MAYOR_URL)
    yield p
Code Example #25
File: people.py  Project: fchagnon/scrapers-ca
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE)

    # it's all javascript rendered on the client... wow.
    js = page.xpath('string(//div[@class="inner_container"]/div/script[2])')
    districts = re.findall(r'arrayDistricts\[a.+"(.+)"', js)
    members = re.findall(r'arrayMembres\[a.+"(.+)"', js)
    urls = re.findall(r'arrayLiens\[a.+"(.+)"', js)
    # first item in list is mayor
    p = Legislator(name=members[0], post_id='Gatineau', role='Maire')
    p.add_source(COUNCIL_PAGE)
    mayor_page = lxmlize(MAYOR_CONTACT_PAGE)
    p.add_source(MAYOR_CONTACT_PAGE)
    email = '*****@*****.**' # hardcoded
    p.add_contact('email', email, None)
    yield p

    # list() is needed so the zip object can be sliced on Python 3
    for district, member, url in list(zip(districts, members, urls))[1:]:
      profile_url = COUNCIL_PAGE + '/' + url.split('/')[-1]
      profile_page = lxmlize(profile_url)
      photo_url = profile_page.xpath('string(//img/@src)')
      post_id = 'District ' + re.search(r'\d+', district).group(0)
      email = profile_page.xpath(
          'string(//a[contains(@href, "mailto:")]/@href)')[len('mailto:'):]
      p = Legislator(name=member, post_id=post_id, role='Conseiller')
      p.add_source(COUNCIL_PAGE)
      p.add_source(profile_url)
      p.image = photo_url
      p.add_contact('email', email, None)
      yield p
Code Example #26
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE)
    councillor_divs = page.xpath('//div[@class="councillorCard"]');
    for councillor_div in councillor_divs:
      yield councillor_data(councillor_div)

    mayor_page = lxmlize(MAYOR_PAGE)
    yield mayor_data(mayor_page)
Code Example #27
    def get_people(self):
        page = lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//div[@id="printArea"]//table//tr//td')[4:-1]
        yield self.scrape_mayor(councillors[0])
        for councillor in councillors[1:]:
            name = ' '.join(
                councillor.xpath('string(.//strong/a[last()])').split())
            infostr = councillor.xpath('string(.//strong)')
            try:
                district = infostr.split('-')[1]
                role = 'Councillor'
            except IndexError:
                district = 'Newmarket'
                role = 'Regional Councillor'
            url = councillor.xpath('.//a/@href')[0]

            p = Legislator(name=name, post_id=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            p.image = councillor.xpath('.//img/@src')[0]

            page = lxmlize(url)
            info = page.xpath('//div[@id="printArea"]')[0]
            info = info.xpath('.//p[@class="heading"][2]/following-sibling::p')
            address = info.pop(0).text_content().strip()
            if not address:
                address = info.pop(0).text_content().strip()

            if 'Ward' in info[0].text_content():
                info.pop(0)

            numbers = info.pop(0).text_content().split(':')
            email = page.xpath('//a[contains(@href, "mailto:")]/text()')[0]
            p.add_contact('email', email, None)
            for i, contact in enumerate(numbers):
                if i == 0:
                    continue
                if '@' in contact:
                    continue  # executive assistant email
                else:
                    number = re.findall(r'([0-9]{3}-[0-9]{3}-[0-9]{4})',
                                        contact)[0]
                    ext = re.findall(r'(Ext\. [0-9]{3,4})', contact)
                    if ext:
                        number = number + ext[0].replace('Ext. ', ' x')
                    contact_type = re.findall(r'[A-Za-z]+$', numbers[i - 1])[0]
                if 'Fax' in contact_type:
                    p.add_contact('fax', number, 'legislature')
                elif 'Phone' in contact_type:
                    p.add_contact('voice', number, 'legislature')
                else:
                    p.add_contact(contact_type, number, contact_type)
            site = page.xpath('.//a[contains(text(), "http://")]')
            if site:
                p.add_link(site[0].text_content(), None)
            yield p
Code Example #28
    def get_people(self):
        page = lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//a[contains(@title, "Profile")][1]/@href')
        for councillor in councillors:
            page = lxmlize(councillor)
            info = page.xpath('//table/tbody/tr/td[2]')[0]

            for br in info.xpath('*//br'):
                br.tail = '\n' + br.tail if br.tail else '\n'
            lines = [
                line.strip() for line in info.text_content().split('\n')
                if line.strip()
            ]
            text = '\n'.join(lines)
            name = lines[0].replace('Councillor ', '').replace('Mayor ', '')

            if lines[1].endswith(' Ward'):
                district = lines[1].replace(' Ward', '')
                role = 'Councillor'
            elif lines[1] == 'At Large':
                district = 'Thunder Bay'
                role = 'Councillor'
            else:
                district = 'Thunder Bay'
                role = 'Mayor'
            name = name.replace('Councillor',
                                '').replace('At Large',
                                            '').replace('Mayor', '').strip()

            p = Legislator(name=name, post_id=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(councillor)

            p.image = page.xpath('//td[@valign="top"]/img/@src')[0]

            address = ', '.join(info.xpath('./p/text()')[0:2]).strip()
            address = re.sub(r'\s{2,}', ' ', address)

            p.add_contact('address', address, 'legislature')

            contacts = info.xpath('./p[2]/text()')
            for contact in contacts:
                contact_type, contact = contact.split(':')
                contact = contact.replace('(1st)', '').replace('(2nd)',
                                                               '').strip()
                if 'Fax' in contact_type:
                    p.add_contact('fax', contact, 'legislature')
                elif 'Email' in contact_type:
                    break
                else:
                    p.add_contact('voice', contact, contact_type)

            email = info.xpath(
                './/a[contains(@href, "mailto:")]')[0].text_content()
            p.add_contact('email', email, None)

            yield p
Code Example #29
File: people.py  Project: fchagnon/scrapers-ca
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE)

    councillors = page.xpath('//div[@id="printArea"]//table//tr//td')[4:-1]
    yield self.scrape_mayor(councillors[0])
    for councillor in councillors[1:]:
      name = ' '.join(councillor.xpath('string(.//strong/a[last()])').split())
      infostr = councillor.xpath('string(.//strong)')
      try:
        district = infostr.split('-')[1]
        role = 'Councillor'
      except IndexError:
        district = 'Newmarket'
        role = 'Regional Councillor'
      url = councillor.xpath('.//a/@href')[0]

      p = Legislator(name=name, post_id=district, role=role)
      p.add_source(COUNCIL_PAGE)
      p.add_source(url)

      p.image = councillor.xpath('.//img/@src')[0]

      page = lxmlize(url)
      info = page.xpath('//div[@id="printArea"]')[0]
      info = info.xpath('.//p[@class="heading"][2]/following-sibling::p')
      address = info.pop(0).text_content().strip()
      if not address:
        address = info.pop(0).text_content().strip()

      if 'Ward' in info[0].text_content():
        info.pop(0)

      numbers = info.pop(0).text_content().split(':')
      email = page.xpath('//a[contains(@href, "mailto:")]/text()')[0]
      p.add_contact('email', email, None)
      for i, contact in enumerate(numbers):
        if i == 0:
          continue
        if '@' in contact:
          continue  # executive assistant email
        else:
          number = re.findall(r'([0-9]{3}-[0-9]{3}-[0-9]{4})', contact)[0]
          ext = re.findall(r'(Ext\. [0-9]{3,4})', contact)
          if ext:
            number = number + ext[0].replace('Ext. ', ' x')
          contact_type = re.findall(r'[A-Za-z]+$', numbers[i - 1])[0]
        if 'Fax' in contact_type:
          p.add_contact('fax', number, 'legislature')
        elif 'Phone' in contact_type:
          p.add_contact('voice', number, 'legislature')
        else:
          p.add_contact(contact_type, number, contact_type)
      site = page.xpath('.//a[contains(text(), "http://")]')
      if site:
        p.add_link(site[0].text_content(), None)
      yield p
Code Example #30
File: people.py  Project: fchagnon/scrapers-ca
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE)

    a = page.xpath('//a[contains(@href,"mayor")]')[0]
    yield self.scrape_mayor(a.attrib['href'])

    for a in page.xpath('//a[contains(@href,"councillors/")]'):
      page = lxmlize(a.attrib['href'])
      h1 = page.xpath('string(//h1)')
      if 'Council seat is vacant' not in h1:
        yield self.scrape_councilor(page, h1, a.attrib['href'])
Code Example #31
File: people.py  Project: fchagnon/scrapers-ca
    def get_people(self):
        page = lxmlize(COUNCIL_PAGE)

        a = page.xpath('//a[contains(@href,"mayor")]')[0]
        yield self.scrape_mayor(a.attrib['href'])

        for a in page.xpath('//a[contains(@href,"councillors/")]'):
            page = lxmlize(a.attrib['href'])
            h1 = page.xpath('string(//h1)')
            if 'Council seat is vacant' not in h1:
                yield self.scrape_councilor(page, h1, a.attrib['href'])
Code Example #32
File: people.py  Project: fchagnon/scrapers-ca
    def get_people(self):
        page = lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//*[@class="two_third last"]')
        for councillor in councillors:
            if councillor == councillors[0]:
                yield self.scrape_mayor(councillor)
                continue

            name = councillor.xpath('.//a')[0].text_content().replace(
                'Councillor', '').replace('Mayor', '')
            info = councillor.xpath('.//text()[normalize-space()]')
            district = info[2]
            url = councillor.xpath('.//a')[0].attrib['href']

            p = Legislator(name=name, post_id=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            p.add_contact('voice', info[3].replace('extension', 'x'),
                          'legislature')
            email = councillor.xpath('.//a[contains(@href,"mailto:")]')
            if email:
                email = email[0].text_content()
                p.add_contact('email', email, None)

            site = councillor.xpath('.//a[contains(text(),"Website")]')
            if site:
                p.add_link(site[0].attrib['href'], None)

            page = lxmlize(url)

            p.image = page.xpath('//header/img/@src')[0]

            address = re.findall(
                r'Address: (.*)Phone',
                page.xpath('//div[@class="entry-content"]')[0].text_content())
            if address:
                p.add_contact('address', address[0], 'legislature')

            blog = page.xpath('//a[contains(text(),"Blog")]')
            if blog:
                p.add_link(blog[0].attrib['href'], None)

            facebook = page.xpath(
                '//div[@class="entry-content"]//a[contains(@href, "facebook")]'
            )
            if facebook:
                p.add_link(facebook[0].attrib['href'], None)
            twitter = page.xpath(
                '//div[@class="entry-content"]//a[contains(@href, "twitter")]')
            if twitter:
                p.add_link(twitter[0].attrib['href'], None)
            yield p
Code Example #33
File: people.py  Project: fchagnon/scrapers-ca
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE)
    councillor_pages = page.xpath('//div[@class="imageLinkContent"]/'
                                  'a[starts-with(text(), "Ward")]/@href')

    for councillor_page in councillor_pages:
      yield councillor_data(councillor_page)

    mayor_page = lxmlize(MAYOR_PAGE)
    mayor_connecting_url = mayor_page.xpath('string(//a[@class="headingLink"]'
      '[contains(text(), "Connecting")]/@href)')
    yield mayor_data(mayor_connecting_url)
Code Example #34
File: people.py  Project: fchagnon/scrapers-ca
    def get_people(self):
        page = lxmlize(COUNCIL_PAGE)
        councillor_pages = page.xpath('//div[@class="imageLinkContent"]/'
                                      'a[starts-with(text(), "Ward")]/@href')

        for councillor_page in councillor_pages:
            yield councillor_data(councillor_page)

        mayor_page = lxmlize(MAYOR_PAGE)
        mayor_connecting_url = mayor_page.xpath(
            'string(//a[@class="headingLink"]'
            '[contains(text(), "Connecting")]/@href)')
        yield mayor_data(mayor_connecting_url)
Code Example #35
File: people.py  Project: fchagnon/scrapers-ca
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE)
    types = page.xpath('//div[@class="bluearrow shaded bottomborder "][1]/ul/li/a/@href')[:4]
    for org_type, link in enumerate(types):
      page = lxmlize(link)
      district_urls = page.xpath('//div[@class="parbase list section cplist"]/table/tr/td[1]/b/a/@href')
      for district_url in district_urls:
        page = lxmlize(district_url)
        district = page.xpath('//div[@class="pageHeader"]/h1/text()')[0].split(' - ')[1].strip()

        org = Organization(name=district + org_types[org_type], classification='legislature', jurisdiction_id=self.jurisdiction.jurisdiction_id)
        org.add_source(district_url)
        yield org

        address = ', '.join(page.xpath('//div[@class="left_contents"]/p[1]/text()'))
        contacts = page.xpath('//div[@class="left_contents"]/p[b[text() = "Contact"]]/text()')
        phone = contacts[0].split(':')[1].strip().replace(' ', '-')
        fax = contacts[1].split(':')[1].strip().replace(' ', '-')
        email = page.xpath('//div[@class="left_contents"]//a[contains(@href, "mailto:")]')
        if email:
          email = email[0].text_content()

        site = page.xpath('//div[@class="left_contents"]//a[not(contains(@href,"mailto:"))]')
        if site:
          site = site[0].text_content()

        councillors = page.xpath('//div[@class="right_contents"]//p/text()')
        for i, councillor in enumerate(councillors):
          if 'Vacant' in councillor:
            continue
          p = Legislator(name=councillor, post_id=district)
          p.add_source(COUNCIL_PAGE)
          p.add_source(link)
          p.add_source(district_url)

          if i == 0:
            membership = p.add_membership(org, role='Mayor')
          else:
            membership = p.add_membership(org, role='Councillor')

          membership.post_id = district
          membership.add_contact_detail('address', address, 'legislature')
          if phone:
            membership.add_contact_detail('voice', phone, 'legislature')
          if fax:
            membership.add_contact_detail('fax', fax, 'legislature')
          if email:
            membership.add_contact_detail('email', email, None)
          if site:
            p.add_link(site, None)
          yield p
Code Example #36
File: people.py  Project: fchagnon/scrapers-ca
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE)

    councillors = page.xpath('//a[contains(@title, "Profile")][1]/@href')
    for councillor in councillors:
      page = lxmlize(councillor)
      info = page.xpath('//table/tbody/tr/td[2]')[0]

      for br in info.xpath('*//br'):
        br.tail = '\n' + br.tail if br.tail else '\n'
      lines = [line.strip() for line in info.text_content().split('\n') if line.strip()]
      text = '\n'.join(lines)
      name = lines[0].replace('Councillor ', '').replace('Mayor ', '')

      if lines[1].endswith(' Ward'):
        district = lines[1].replace(' Ward', '')
        role = 'Councillor'
      elif lines[1] == 'At Large':
        district = 'Thunder Bay'
        role = 'Councillor'
      else:
        district = 'Thunder Bay'
        role = 'Mayor'
      name = name.replace('Councillor', '').replace('At Large', '').replace('Mayor', '').strip()

      p = Legislator(name=name, post_id=district, role=role)
      p.add_source(COUNCIL_PAGE)
      p.add_source(councillor)

      p.image = page.xpath('//td[@valign="top"]/img/@src')[0]

      address = ', '.join(info.xpath('./p/text()')[0:2]).strip()
      address = re.sub(r'\s{2,}', ' ', address)

      p.add_contact('address', address, 'legislature')

      contacts = info.xpath('./p[2]/text()')
      for contact in contacts:
        contact_type, contact = contact.split(':')
        contact = contact.replace('(1st)', '').replace('(2nd)', '').strip()
        if 'Fax' in contact_type:
          p.add_contact('fax', contact, 'legislature')
        elif 'Email' in contact_type:
          break
        else:
          p.add_contact('voice', contact, contact_type)

      email = info.xpath('.//a[contains(@href, "mailto:")]')[0].text_content()
      p.add_contact('email', email, None)

      yield p
Code Example #37
File: people.py  Project: fchagnon/scrapers-ca
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE)

    mayor_url = page.xpath('//a[contains(text(), "Office of the Mayor")]/@href')[0]
    yield scrape_mayor(mayor_url)

    councillors = page.xpath('//div[@class="interiorContentWrapper"]//td[./a]')
    for councillor in councillors:
      name = councillor.xpath('.//strong')[1].text_content().strip()
      district = councillor.xpath('.//a//text()[normalize-space()]')[0]
      if 'Ward' in district:
        district = district.replace('Councillor', '')
        role = 'Councillor'
      else:
        role = district
        district = 'Markham'

      image = councillor.xpath('.//img/@src')[0]
      url = councillor.xpath('.//a/@href')[0]

      if 'Ward 4' in district:
        yield scrape_4(name, url, image)
        continue

      page = lxmlize(url)

      p = Legislator(name=name, post_id=district, role=role)
      p.add_source(COUNCIL_PAGE)
      p.add_source(url)

      p.image = image
      contact = page.xpath('//div[@class="microSiteLinksWrapper"]')[1]

      if contact.xpath('.//p/text()'):
        infos = contact.xpath('.//p/text()')
      else:
        infos = contact.xpath('.//div/text()')

      address = re.sub(r'\s{2,}', ' ', ' '.join(infos[:2])).strip()
      phone = infos[2].split(':')[1].strip()
      email = contact.xpath('.//a[contains(@href,"mailto:")]/text()')[0]
      website = contact.xpath('.//a[not( contains(@href, "mailto:"))]/text()')
      if website:
        p.add_link(website[0], None)
      p.add_contact('address', address, 'legislature')
      p.add_contact('voice', phone, 'legislature')
      p.add_contact('email', email, None)

      get_links(p, contact)
      yield p
Code Example #38
File: people.py  Project: fchagnon/scrapers-ca
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE)
    for person_link in page.xpath('//a[@class="L4"]'):
        role, name = person_link.text_content().split(' ', 1)
        url = person_link.attrib['href']
        page = lxmlize(url)
        photo_url = page.xpath('string(//img[@class="img-right"]/@src)')
        email = page.xpath('string(//a[starts-with(@href, "mailto:")])')

        p = Legislator(name=name, post_id='Coquitlam', role=role, image=photo_url)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('email', email, None)
        yield p
Code Example #39
File: people.py  Project: fchagnon/scrapers-ca
    def get_people(self):
        page = lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//table[@id="MLAs"]//tr')[1:]
        for councillor in councillors:
            name = councillor.xpath('./td')[0].text_content().split('. ', 1)[1]
            party = councillor.xpath('./td')[1].text
            district = councillor.xpath('./td')[2].text_content()
            url = councillor.xpath('./td[1]/a/@href')[0]
            page = lxmlize(url)

            p = Legislator(name=name,
                           post_id=district,
                           role='MLA',
                           party=party)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            contact = page.xpath('//table[@id="mla-contact"]//tr[2]')[0]
            website = contact.xpath('./td[3]//div[3]//a')
            if website:
                p.add_link(website[0].text_content(), None)

            p.add_contact('address',
                          contact.xpath('./td[1]/div[2]')[0].text_content(),
                          'legislature')
            p.add_contact('address',
                          ''.join(contact.xpath('./td[2]/div//text()')[1:7]),
                          'constituency')
            numbers = [
                contact.xpath('./td[1]/div[3]')[0].text_content().split(
                    ':')[1].strip(),
                contact.xpath('./td[2]/div[4]//span/text()')[0],
                contact.xpath('./td[1]/div[4]')[0].text_content().split(':')
                [1].strip(),
                contact.xpath('./td[2]/div[5]//span/text()')[0],
            ]
            for index, number in enumerate(numbers):
                if len(number) < 10:
                    numbers[index] = '306-%s' % number
            p.add_contact('voice', numbers[0], 'legislature')
            p.add_contact('voice', numbers[1], 'constituency')
            p.add_contact('fax', numbers[2], 'legislature')
            p.add_contact('fax', numbers[3], 'constituency')
            p.add_contact(
                'email',
                contact.xpath('./td[3]//a[contains(@href, "mailto:")]/text()')
                [0], None)

            yield p
Code Example #40
  def get_people(self):
    reeve_page = lxmlize(REEVE_URL)
    reeve_name = reeve_page.xpath('string(//b)').split(',')[0]

    page = lxmlize(COUNCIL_PAGE)

    councillors = page.xpath('//table[@class="table-plain"]/tbody/tr/td[2]')
    for councillor in councillors:
      name = councillor.xpath('./h2')[0].text_content().split(
          'Division')[0].strip()
      district = re.findall(r'(Division [0-9])', councillor.xpath('./h2')[0].text_content())[0]

      p = Legislator(name=name, post_id=district, role='Councillor')
      p.add_source(COUNCIL_PAGE)

      image = councillor.xpath('./preceding-sibling::td//img/@src')[0]
      p.image = image

      address = councillor.xpath('./p[1]')[0].text_content()
      email = councillor.xpath('.//a[contains(@href, "mailto:")]')[0].text_content()

      p.add_contact('address', address, 'legislature')
      p.add_contact('email', email, None)

      numbers = councillor.xpath('./p[2]')[0].text_content().replace('Email: ', '').replace(email, '').split(':')
      for index, number in enumerate(numbers):
        if index == 0:
          continue
        contact_type = re.findall(r'[A-Za-z]+', numbers[index - 1])[0]
        number = re.findall(r'[0-9]{3}.[0-9]{3}.[0-9]{4}', number)[0].replace('.', '-')
        if contact_type == 'Fax':
          p.add_contact('fax', number, 'legislature')
        elif contact_type == 'Cell':
          p.add_contact('cell', number, 'legislature')
        elif contact_type == 'Hm':
          p.add_contact('voice', number, 'residence')
        else:
          raise Exception('Unrecognized contact type %s' % contact_type)

      # @todo Uncomment when upgrading from Pupa 0.0.3.
      # if name == reeve_name:
      #   membership = Membership(
      #       p._id,
      #       'jurisdiction::ocd-jurisdiction/country:ca/csd:4819006/council',
      #       post_id='district::Grande Prairie County No. 1',
      #       contact_details=p._contact_details,
      #       role='Reeve')
      #   p._related.append(membership)

      yield p
Code Example #41
File: people.py  Project: fchagnon/scrapers-ca
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE)

    councillors = page.xpath('//div[@class="PL_Column1"]//ul[@class="dfwp-list"][1]/li/div/div/a')
    for councillor in councillors:
      url = councillor.attrib['href']
      page = lxmlize(url)

      title = page.xpath('//div[@class="PL_Title"]')[0].text_content()
      if "Councillor" in title:
        district, name = re.split(r'Councillor', title)
        role = 'Councillor'
        if "Regional" in district:
          district = "Vaughan"
          role = 'Regional Councillor'
      else:
        name = re.split(r'Mayor', title)[-1]
        district = 'Vaughan'
        role = 'Mayor'
      name = name.strip()
      if councillor == councillors[0]:
        contact_info = page.xpath('//div[@id="WebPartWPQ2"]')[0]
      else:
        contact_info = page.xpath('//div[@id="WebPartWPQ3"]')[0]

      phone = re.findall(r'[0-9]{3}-[0-9]{3}-[0-9]{4} ext. [0-9]{4}', contact_info.text_content())[0].replace('ext. ', 'x')
      fax = re.findall(r'[0-9]{3}-[0-9]{3}-[0-9]{4}', contact_info.text_content())[1]
      email = contact_info.xpath('.//a[contains(@href, "mailto:")]')[0].text_content()

      p = Legislator(name=name, post_id=district.strip(), role=role)
      p.add_source(COUNCIL_PAGE)
      p.add_source(url)
      p.add_contact('voice', phone, 'legislature')
      p.add_contact('fax', fax, 'legislature')
      p.add_contact('email', email, None)

      image = page.xpath('//img[contains(@alt, "Councillor")]/@src')
      if image:
        p.image = image[0]

      # Social-media links are published in a separate web part on the profile page.
      sites = page.xpath('//div[@id="WebPartWPQ5"]')[0]

      for network in ('facebook', 'twitter', 'youtube'):
        links = sites.xpath('.//a[contains(@href, "%s")]' % network)
        if links:
          p.add_link(links[0].attrib['href'], None)
      yield p
Code example #42
    def get_people(self):
        page = lxmlize(COUNCIL_PAGE)

        mayor_url = page.xpath(
            '//td[@class="sask_LeftNavLinkContainer"]/a/@href')[0]
        yield scrape_mayor(mayor_url)

        email_page = lxmlize(EMAIL_URL)
        c_options = email_page.xpath(
            '//select[@id="councillorList"]/option[contains(text(), "Ward")]')
        email_dict = dict((opt.text.split(' - ')[0], opt.attrib['value'])
                          for opt in c_options)

        councillors = page.xpath(
            '//td[@class="sask_LeftNavChildNodeContainer"]//a')
        for councillor in councillors:
            district, name = councillor.text_content().split(' - Councillor ')
            url = councillor.attrib['href']

            p = Legislator(name=name, post_id=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            page = lxmlize(url)
            try:
                p.add_contact('email', email_dict[district], None)
            except KeyError:
                email = page.xpath(
                    'string(//a[contains(@href, "mailto:")]/@href)')
                # The href includes the "mailto:" scheme; strip it before storing.
                p.add_contact('email', email[len('mailto:'):], None)

            contacts = page.xpath('//p[@class="para12"]')[0]
            if not contacts.text_content().strip():
                contacts = page.xpath('//p[@class="para12"]')[1]
            contacts = re.split(r'\xa0', contacts.text_content())
            contacts = [x for x in contacts if x.strip()]
            for i, contact in enumerate(contacts):
                if 'Contact' in contact:
                    continue
                if contact == contacts[-1]:
                    break
                contact_type = contact.replace(':', '').strip()
                value = contacts[i + 1].replace('(', '').replace(') ',
                                                                 '-').strip()
                if 'Fax' in contact_type:
                    p.add_contact('fax', value, 'legislature')
                if 'Phone' in contact_type:
                    p.add_contact(contact_type, value, contact_type)
            yield p
Code example #43
File: people.py Project: fchagnon/scrapers-ca
def mayor_data(url, name):
  page = lxmlize(url)
  photo_url = urljoin(url, 
      page.xpath('string((//div[@id="contentcontainer"]//img)[1]/@src)'))
  contact_page = lxmlize(MAYOR_CONTACT_URL)
  email = contact_page.xpath('string(//a[contains(., "@")][1])')

  m = Legislator(name=name, post_id='Regina', role='Mayor')
  m.add_source(COUNCIL_PAGE)
  m.add_source(url)
  m.add_source(MAYOR_CONTACT_URL)
  m.add_contact('email', email, None)
  m.image = photo_url

  return m
Code example #44
File: people.py Project: fchagnon/scrapers-ca
  def get_people(self):
    contact_page = lxmlize(CONTACT_URL)
    email = contact_page.xpath('string(//a[starts-with(@href, "mailto:")])')
    page = lxmlize(COUNCIL_PAGE)
    for url in page.xpath('//a/@href[contains(., "members/")]'):
      page = lxmlize(url)
      role, name = page.xpath('string(//h1)').split(' ', 1)
      # image element is inserted by a script somewhere
      #photo_url = page.xpath('string(//span[@class="imageShadow"]/img/@src)')

      p = Legislator(name=name, post_id='Richmond', role=role)
      p.add_source(COUNCIL_PAGE)
      p.add_source(url)
      p.add_contact('email', email, None)
      yield p
Code example #45
File: people.py Project: fchagnon/scrapers-ca
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE)
    rows = page.xpath('//div[@class="main-content"]//tr')[1:]
    for row in rows:
      name_cell = row.xpath('./td[1]')[0]
      last_name = name_cell.xpath('string(.//span[1])')
      first_name = name_cell.xpath('string(.//span[2])')
      name = '%s %s' % (first_name, last_name)
      constituency = row.xpath('string(./td[2])')
      province = row.xpath('string(./td[3])')
      party = row.xpath('string(./td[4])')

      url = name_cell.xpath('string(.//a/@href)')
      mp_page = lxmlize(url)
      email = mp_page.xpath('string(//span[@class="caucus"]/'
                            'a[contains(., "@")])')
      photo = mp_page.xpath('string(//div[@class="profile overview header"]//'
                            'img/@src)')

      m = Legislator(name=name, post_id=constituency, role='MP', chamber='lower', party=party)
      m.add_source(COUNCIL_PAGE)
      m.add_source(url)
      m.add_contact('email', email, None)
      m.image = photo

      m.add_contact('address', 'House of Commons\nOttawa ON  K1A 0A6', 'legislature')
      voice = mp_page.xpath('string(//div[@class="hilloffice"]//span[contains(text(), "Telephone:")])')
      if voice:
        m.add_contact('voice', voice.replace('Telephone: ', ''), 'legislature')
      fax = mp_page.xpath('string(//div[@class="hilloffice"]//span[contains(text(), "Fax:")])').replace('Fax: ', '')
      if fax:
        m.add_contact('fax', fax, 'legislature')

      for li in mp_page.xpath('//div[@class="constituencyoffices"]//li'):
        spans = li.xpath('./span[not(@class="spacer")]')
        m.add_contact('address', '\n'.join([
          spans[0].text_content(), # address
          spans[1].text_content(), # city, region
          spans[2].text_content(), # postal code
        ]), 'constituency')
        voice = li.xpath('string(./span[contains(text(), "Telephone:")])').replace('Telephone: ', '')
        if voice:
          m.add_contact('voice', voice, 'constituency')
        fax = li.xpath('string(./span[contains(text(), "Fax:")])').replace('Fax: ', '')
        if fax:
          m.add_contact('fax', fax, 'constituency')

      yield m
Code example #46
File: people.py Project: fchagnon/scrapers-ca
    def get_people(self):
        page = lxmlize(COUNCIL_PAGE)

        councillors = page.xpath(
            '//div[@id="ctl00_ContentPlaceHolder1_ContentBlock1"]//a/parent::p'
        )
        for councillor in councillors:
            if not councillor.text_content().strip():
                continue
            if 'Mayor' in councillor.text_content():
                name = councillor.text_content().replace('Mayor ', '')
                district = 'Haldimand County'
                role = 'Mayor'
            else:
                district, name = councillor.text_content().split(' - ')
                name = name.replace('Councillor', '').strip()
                district = district.strip()
                role = 'Councillor'

            url = councillor.xpath('.//a')[0].attrib['href']
            page = lxmlize(url)

            p = Legislator(name=name, post_id=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            p.image = page.xpath(
                '//div[@id="ctl00_ContentPlaceHolder1_ContentBlock1"]//tr[1]/td//img/@src'
            )[0]

            info = page.xpath(
                '//a[contains(@href, "mailto:")]/parent::*/text()')
            for i, field in enumerate(info):
                if re.match(r'[0-9]+ [A-Z]', field):
                    address = field + ', ' + info[i + 1] + ', ' + info[i + 2]
                    p.add_contact('address', address, 'legislature')
                if re.findall(r'[0-9]{3} [0-9]{3} [0-9]{4}', field):
                    if 'Fax' in field:
                        num = field.replace('Fax: ',
                                            '').strip().replace(' ', '-')
                        p.add_contact('fax', num, 'legislature')
                    else:
                        num = field.replace('Telephone: ',
                                            '').strip().replace(' ', '-')
                        p.add_contact('voice', num, 'legislature')
            email = page.xpath('//a[contains(@href, "mailto:")]/text()')[0]
            p.add_contact('email', email, None)
            yield p
Code example #47
File: people.py Project: fchagnon/scrapers-ca
    def get_people(self):
        page = lxmlize(COUNCIL_PAGE)

        mayor = page.xpath('//td[@class="LeftLinksSectionMenu"]/a')[0]
        name = mayor.text_content().replace('Mayor', '').strip()
        url = mayor.attrib['href']
        mayor_page = lxmlize(url)
        p = Legislator(name=name, post_id='Westmount', role='Maire')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        mayor_info = mayor_page.xpath(
            '//div[@style="padding-right:10px;"]/table')[0]
        phone = mayor_info.xpath('.//tr[2]/td[2]')[0].text_content().replace(
            ' ', '-')
        fax = mayor_info.xpath('.//tr[3]/td[2]')[0].text_content().replace(
            ' ', '-')
        email = mayor_info.xpath('.//tr[4]/td[2]')[0].text_content().strip()
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')
        p.add_contact('email', email, None)
        yield p

        councillors = page.xpath(
            '//td[@class="LeftLinksSectionMenu" and contains(@style, "border-bottom-style: dashed;")]/a'
        )
        for i, councillor in enumerate(councillors):
            name = councillor.text_content().strip()
            url = councillor.attrib['href']
            page = lxmlize(url)

            if page.xpath('boolean(.//div[@class="SectionTitle"][2])'):
                district = page.xpath('.//div[@class="SectionTitle"]')[
                    1].text_content().split('-')[0].strip()
            else:
                district = 'District ' + str(i + 1)

            info = page.xpath('.//div[@style="padding-right:10px;"]/table')[0]
            phone = info.xpath('.//tr[2]/td[2]')[0].text_content().replace(
                ' ', '-')
            email = info.xpath('.//tr[3]/td[2]')[0].text_content().strip()
            p = Legislator(name=name, post_id=district, role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.image = info.xpath(
                './ancestor::td//div[not(@id="insert")]/img/@src')[0]
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('email', email, None)
            yield p
Code example #48
def get_details(url):
    page = lxmlize(url)
    image = page.xpath('string(//img[@class="portrait"]/@src)')
    phone = page.xpath('string(//dd[@class="numbers"]/text())').split(': ')[1]
    email_js = page.xpath('string(//dd/script)')
    email_addr = process_email(email_js)
    return image, phone, email_addr
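
The get_details helper above calls a process_email function that is not shown here. A rough, hypothetical sketch is below; it assumes the page script merely wraps a plain mailto address, whereas the real decoding logic may be quite different.

import re


def process_email(email_js):
    # Assumed obfuscation: the script contains a mailto link assembled in
    # JavaScript, e.g. document.write('<a href="mailto:someone@example.org">').
    match = re.search(r'mailto:([^\'"\\>]+)', email_js)
    if match:
        return match.group(1)
    # Otherwise fall back to anything shaped like an e-mail address.
    match = re.search(r'[\w.+-]+@[\w.-]+\.\w+', email_js)
    return match.group(0) if match else None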
Code example #49
File: people.py Project: fchagnon/scrapers-ca
  def get_people(self):

    tmpdir = tempfile.mkdtemp()
    page = lxmlize(COUNCIL_PAGE)

    mayor = page.xpath('//div[@class="box"]/p/text()')
    m_name = mayor[0].strip().split('.')[1].strip()
    m_phone = mayor[1].strip().split(':')[1].strip()

    m = Legislator(name=m_name, post_id='Saguenay', role='Maire')
    m.add_source(COUNCIL_PAGE)
    m.add_contact('voice', m_phone, 'legislature')

    yield m

    councillors = page.xpath('//div[@class="box"]//div')
    for councillor in councillors:
      district = councillor.xpath('./h3')[0].text_content().replace('#', '')
      name = councillor.xpath('.//p/text()')[0].encode('latin-1').decode('utf-8')
      name = name.replace('M. ', '').replace('Mme ', '').strip()
      phone = councillor.xpath('.//p/text()')[1].split(':')[1].strip().replace(' ', '-')
      email = councillor.xpath('.//a[contains(@href, "mailto:")]')[0].text_content()

      url = councillor.xpath('./p/a')[0].attrib['href']

      p = Legislator(name=name, post_id=district, role='Conseiller')
      p.add_source(COUNCIL_PAGE)

      p.add_contact('voice', phone, 'legislature')
      p.add_contact('email', email, None)
      yield p
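
The encode('latin-1').decode('utf-8') step above repairs names whose UTF-8 bytes were mis-decoded as latin-1 by the server. A tiny, self-contained illustration (the sample name is made up):

# Mojibake repair: re-encoding to latin-1 recovers the original UTF-8
# bytes, which then decode correctly.
garbled = 'TremblÃ©'
repaired = garbled.encode('latin-1').decode('utf-8')
assert repaired == 'Tremblé'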
Code example #50
File: people.py Project: fchagnon/scrapers-ca
    def get_people(self):
        page = lxmlize(COUNCIL_PAGE)

        councillor_trs = [
            tr for tr in page.xpath('//table//tr[1]') if len(tr) == 2
        ][:-1]
        for councillor_tr in councillor_trs:
            desc = [
                text.strip()
                for text in councillor_tr.xpath('.//text()[normalize-space()]')
                if text.strip()
            ]

            if len(desc) == 3:
                role = 'Maire'
                district = u'Saint-Jérôme'
            else:
                role = 'Conseiller'
                district = desc[0].replace(u'numéro ', '')

            name = desc[-3]
            phone = desc[-2]
            email = desc[-1]

            # string() already returns the URL itself; indexing it would
            # yield only its first character.
            image = councillor_tr.xpath('string(.//img/@src)')

            p = Legislator(name=name, post_id=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.image = image
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('email', email, None)
            yield p
Code example #51
File: people.py Project: fchagnon/scrapers-ca
    def get_people(self):
        page = lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//div[@id="content"]//tr')

        for i, councillor in enumerate(councillors):
            if 'Maire' in councillor.text_content():
                name = councillor.xpath('./td')[1].text_content()
                district = 'Sainte-Anne-de-Bellevue'
                role = 'Maire'
            else:
                name = councillor.xpath('./td')[1].text_content()
                district = 'District ' + re.findall(
                    r'\d',
                    councillor.xpath('./td')[0].text_content())[0]
                role = 'Conseiller'

            p = Legislator(name=name, post_id=district, role=role)
            p.add_source(COUNCIL_PAGE)

            email = councillor.xpath('.//a')
            if email:
                email = email[0].attrib['href'].replace('mailto:', '')
                p.add_contact('email', email, None)
            yield p
Code example #52
File: people.py Project: fchagnon/scrapers-ca
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE, user_agent='Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)')

    yield self.scrape_mayor(page)

    councillors = page.xpath('//strong[contains(text(), "Councillor")]/parent::p|//b[contains(text(), "Councillor")]/parent::p')
    for councillor in councillors:

      name = councillor.xpath('./strong/text()|./b/text()')[0].replace('Councillor', '').strip()
      district = re.findall(r'(?<=Ward \d, ).*', councillor.text_content())[0].strip()

      p = Legislator(name=name, post_id=district, role='Councillor')
      p.add_source(COUNCIL_PAGE)

      p.image = councillor.xpath('.//img/@src')[0]

      phone = re.findall(r'Phone(.*)', councillor.text_content())
      node = councillor
      while not phone:
        node = node.xpath('./following-sibling::p')[1]
        phone = re.findall(r'Phone(.*)', node.text_content())
      phone = phone[0].strip()

      email = councillor.xpath('.//a[contains(@href, "mailto:")]')
      if not email:
        email = councillor.xpath('./following-sibling::p//a[contains(@href, "mailto")]')
      email = email[0].text_content()

      if len(re.sub(r'\D', '', phone)) == 7:
        phone = '902-%s' % phone
      p.add_contact('voice', phone, 'legislature')
      p.add_contact('email', email, None)

      yield p
Code example #53
  def get_people(self):
    page = lxmlize(COUNCIL_PAGE)

    mayor = page.xpath('.//div[@class="item-page clearfix"]//table[1]//p')[1]
    name = mayor.xpath('.//strong/text()')[0]

    p = Legislator(name=name, post_id='Pointe-Claire', role='Maire')
    p.add_source(COUNCIL_PAGE)

    phone = re.findall(r'[0-9]{3}[ -][0-9]{3}-[0-9]{4}', mayor.text_content())[0].replace(' ', '-')
    p.add_contact('voice', phone, 'legislature')
    yield p

    rows = page.xpath('//tr')
    for i, row in enumerate(rows):
      if i % 2 == 0:
        continue
      councillors = row.xpath('./td')
      for j, councillor in enumerate(councillors):
        name = councillor.text_content()
        # rows[i + 1].xpath('.//td//a[contains(@href, "maps")]/text()')[j] # district number
        district = rows[i + 1].xpath('.//td/p[1]/text()')[j].replace(' / ', '/')

        p = Legislator(name=name, post_id=district, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.image = councillor.xpath('.//img/@src')[0]

        phone = re.findall(r'[0-9]{3}[ -][0-9]{3}-[0-9]{4}', rows[i + 1].xpath('.//td')[j].text_content())[0].replace(' ', '-')

        p.add_contact('voice', phone, 'legislature')

        yield p