Python AggregationLegislator Examples, utils.AggregationLegislator Python Examples

Example #1

0

Show file

File: people.py Project: fchagnon/scrapers-ca

    def get_people(self):
        """Yield a council Organization and its Legislators per directory row.

        Starting with the sixth table row inside the search-results container
        of COUNCIL_PAGE, every third row is read as one municipality.  The
        address, fax, phone and email found in that row are attached to the
        membership of every person listed in it.
        """
        root = lxmlize(COUNCIL_PAGE)
        rows = root.xpath(
            '//div[@id="ctl00_PublicContent_divSearchContent"]//tr')

        for row in rows[5::3]:
            texts = row.xpath('.//td//text()')
            # When the first text node is a single character, glue it to the
            # node that follows it to form the municipality name.
            if len(texts[0]) > 1:
                municipality = texts[0]
            else:
                municipality = ''.join(texts[:2])
            # @todo Need to distinguish between, e.g., R.M. and Town
            municipality = municipality.title()

            council = Organization(
                name=municipality + ' Municipal Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            council.add_source(COUNCIL_PAGE)
            yield council

            cell_texts = row.xpath('.//td/text()')
            # The first four text nodes form the address; everything from
            # "Fax:" onwards is cut off.
            address = re.sub(
                r'(Fax:.*)', '', ' '.join(cell_texts[:4])).strip()
            fax_texts = [text for text in cell_texts if 'Fax' in text]
            fax = fax_texts[0].split(':')[1].strip()

            phone_text = row.xpath(
                './/b[contains(text(), "Phone")]/text()')[0]
            phone = phone_text.split(':')[1].strip()
            email_text = row.xpath(
                './/a[contains(@href, "mailto:")]/text()')[0]
            email = email_text.strip()

            names = row.xpath('.//td[3]/text()')
            positions = row.xpath('.//td[2]/b/text()')
            for index, person_name in enumerate(names):
                person = Legislator(name=person_name, post_id=municipality)
                person.add_source(COUNCIL_PAGE)

                # The first two people take their role from the bold labels
                # in the second cell; everyone after them is a councillor.
                # @todo "Resident Administrator & Chief Administrative Officer" is split on two lines
                role = positions[index] if index < 2 else 'Councillor'
                membership = person.add_membership(council, role=role)

                membership.post_id = municipality
                details = (
                    ('address', address, 'legislature'),
                    ('fax', fax, 'legislature'),
                    ('voice', phone, 'legislature'),
                    ('email', email, None),
                )
                for kind, value, note in details:
                    membership.add_contact_detail(kind, value, note)
                yield person

Example #2

0

Show file

File: people.py Project: fchagnon/scrapers-ca

  def get_people(self):
    """Yield a council Organization and its mayor or warden per PDF entry.

    COUNCIL_PAGE is downloaded as a PDF, converted to text with the external
    ``pdftotext`` tool and split on the "Mayor " / "Warden " labels.  Each
    chunk is consumed line by line: name, district (one or two lines),
    address (two to four lines), phone, then an optional fax.
    """
    response = urllib2.urlopen(COUNCIL_PAGE).read()
    # A PDF is binary data: write it in binary mode, and let the with-block
    # close the handle even if the write fails.
    with open('/tmp/ns.pdf', 'wb') as pdf:
      pdf.write(response)

    try:
      data = subprocess.check_output(['pdftotext', '/tmp/ns.pdf', '-'])
    finally:
      # The converted text is in memory, so the temp file can go right away;
      # this also cleans up when conversion fails or when the generator is
      # never run to completion.
      os.remove('/tmp/ns.pdf')

    emails = re.findall(r'(?<=E-mail: ).+', data)
    for mayor in re.split(r'Mayor |Warden ', data)[1:]:
      lines = mayor.splitlines(True)
      name = lines.pop(0).strip()
      # NOTE(review): this name is skipped explicitly; the reason is not
      # visible in this method.
      if name == "Jim Smith":
        continue
      district = lines.pop(0).strip()
      # A following line without any digit is treated as the second half of
      # the district name.
      if not re.findall(r'[0-9]', lines[0]):
        district = district + ' ' + lines.pop(0).strip()

      org = Organization(
        name=district + ' Municipal Council',
        classification='legislature',
        jurisdiction_id=self.jurisdiction.jurisdiction_id)
      org.add_source(COUNCIL_PAGE)
      yield org

      p = Legislator(name=name, post_id=district)
      p.add_source(COUNCIL_PAGE)
      membership = p.add_membership(org, role='Mayor', post_id=district)

      # Two address lines are always present; up to two more are taken as
      # long as the "Phone" line has not been reached.
      address_parts = [lines.pop(0).strip(), lines.pop(0).strip()]
      for _ in range(2):
        if 'Phone' not in lines[0]:
          address_parts.append(lines.pop(0).strip())
      address = ', '.join(address_parts)

      phone = lines.pop(0).split(':')[1].strip()
      # Reset for every entry so that an entry without a fax neither raises
      # NameError nor inherits the previous entry's number.
      fax = None
      if 'Fax' in lines.pop(0):
        # Lines keep their line ending (splitlines(True)); strip it off.
        fax = lines.pop(0).strip()

      membership.add_contact_detail('address', address, 'legislature')
      membership.add_contact_detail('voice', phone, 'legislature')
      if fax:
        membership.add_contact_detail('fax', fax, 'legislature')

      # @todo emails are being assigned incorrectly, e.g. Town of Berwick picks
      # up Cape Breton Regional Municipality and Region of Queens Municipality
      regex = name.split()[-1].lower() + '|' + '|'.join(district.split()[-2:]).replace('of', '').lower()
      regex = regex.replace('||', '|')
      pattern = re.compile(regex)
      # Walk a snapshot: removing from the list being iterated would make
      # the loop skip the element right after each match.
      for email in list(emails):
        if pattern.search(email):
          emails.remove(email)
          membership.add_contact_detail('email', email, None)
      yield p

Example #3

0

Show file

File: people.py Project: fchagnon/scrapers-ca

  def get_people(self):
    """Yield one council Organization per directory row, then its members.

    Rows 6, 9, 12, ... of the search-results container on COUNCIL_PAGE each
    describe one municipality.  Contact details parsed from a row are shared
    by every Legislator yielded for that row.
    """
    document = lxmlize(COUNCIL_PAGE)
    table_rows = document.xpath('//div[@id="ctl00_PublicContent_divSearchContent"]//tr')

    for entry in table_rows[5::3]:
      text_nodes = entry.xpath('.//td//text()')
      # A single-character first node is joined with the node after it.
      if len(text_nodes[0]) > 1:
        title = text_nodes[0]
      else:
        title = ''.join(text_nodes[:2])

      # @todo Need to distinguish between, e.g., R.M. and Town
      title = title.title()
      organization = Organization(
        name=title + ' Municipal Council',
        classification='legislature',
        jurisdiction_id=self.jurisdiction.jurisdiction_id)
      organization.add_source(COUNCIL_PAGE)
      yield organization

      cell_text = entry.xpath('.//td/text()')
      joined = ' '.join(cell_text[:4])
      # Drop the fax portion that trails the address text.
      address = re.sub(r'(Fax:.*)', '', joined).strip()
      fax_entries = [piece for piece in cell_text if 'Fax' in piece]
      fax = fax_entries[0].split(':')[1].strip()

      phone_node = entry.xpath('.//b[contains(text(), "Phone")]/text()')[0]
      phone = phone_node.split(':')[1].strip()
      email_node = entry.xpath('.//a[contains(@href, "mailto:")]/text()')[0]
      email = email_node.strip()

      members = entry.xpath('.//td[3]/text()')
      positions = entry.xpath('.//td[2]/b/text()')
      for rank, member in enumerate(members):
        person = Legislator(name=member, post_id=title)
        person.add_source(COUNCIL_PAGE)

        if rank < 2:
          # @todo "Resident Administrator & Chief Administrative Officer" is split on two lines
          role = positions[rank]
        else:
          role = 'Councillor'
        membership = person.add_membership(organization, role=role)

        membership.post_id = title
        membership.add_contact_detail('address', address, 'legislature')
        membership.add_contact_detail('fax', fax, 'legislature')
        membership.add_contact_detail('voice', phone, 'legislature')
        membership.add_contact_detail('email', email, None)
        yield person

Example #4

0

Show file

File: people.py Project: fchagnon/scrapers-ca

  def get_people(self):
    """Yield a council Organization and one Legislator per bold paragraph lead.

    Each <strong> inside a paragraph of the "entry-content" container holds a
    role and a name.  The district comes from the closest preceding <h2>;
    address, phone and fax come from the text nodes of the surrounding
    paragraph.  Entries whose role is empty or contains "SAO" are skipped.
    """
    root = lxmlize(COUNCIL_PAGE)

    for strong in root.xpath('//div[@class="entry-content"]//p/strong'):
      heading = strong.xpath('./ancestor::p/preceding-sibling::h2')[-1]
      district = heading.text_content().split('–'.decode('utf-8'))[0]

      label = strong.text_content()
      # The last two words are the name; what precedes the first hyphen in
      # the remainder is the role.
      name = ' '.join(label.split()[-2:]).replace('-Â'.decode('utf-8'), '')
      role = label.replace(name, '').split('-')[0]
      if not role or 'SAO' in role:
        continue

      org = Organization(
        name=district + ' Municipal Council',
        classification='legislature',
        jurisdiction_id=self.jurisdiction.jurisdiction_id)
      org.add_source(COUNCIL_PAGE)
      yield org

      p = Legislator(name=name, post_id=district)
      p.add_source(COUNCIL_PAGE)
      membership = p.add_membership(org, role=role, post_id=district)

      for text in strong.xpath('./ancestor::p/text()'):
        if 'NT' in text:
          membership.add_contact_detail('address', text.strip(), 'legislature')
        # The checks run in sequence on purpose: the "Fax" test sees the
        # text as rewritten by the "Tel" branch.
        if 'Tel' in text:
          text = text.replace('Tel. ', '')
          text = text.replace('(', '').replace(') ', '-').strip()
          membership.add_contact_detail('voice', text, 'legislature')
        if 'Fax' in text:
          text = text.replace('Fax ', '')
          text = text.replace('(', '').replace(') ', '-').strip()
          membership.add_contact_detail('fax', text, 'legislature')

      mail_texts = strong.xpath('./parent::p//a[contains(@href, "mailto:")]/text()')
      membership.add_contact_detail('email', mail_texts[0], None)

      paragraph = strong.xpath('./parent::p')[0]
      if 'Website' in paragraph.text_content():
        # The second anchor of the paragraph is used as the website link.
        anchors = strong.xpath('./parent::p//a')
        p.add_link(anchors[1].attrib['href'], None)
      yield p

Example #5

0

Show file

  def get_people(self):
    """Yield a council Organization and its elected officials per district.

    Every link in the "left-content" / "right-content" containers of
    COUNCIL_PAGE is followed.  On the linked page, the <dl> entries that come
    before the "Officials" entry supply the shared contact details, and the
    <pre> text under "Elected Officials" supplies one person per line.
    """
    page = lxmlize(COUNCIL_PAGE)

    districts = page.xpath('//div[@id="left-content" or @id="right-content"]//a')
    for district in districts:
      url = district.attrib['href']
      district_name = district.text_content()
      district_page = lxmlize(url)

      org = Organization(
        name=district_name + ' Council',
        classification='legislature',
        jurisdiction_id=self.jurisdiction.jurisdiction_id)
      org.add_source(url)
      yield org

      # Reset for every district: otherwise a page that lacks one of these
      # entries raises NameError (first district) or silently reuses the
      # value scraped for the previous district.
      phone = fax = address = email = site = None
      for entry in district_page.xpath('//div[@style="WIDTH:750"]/dl'):
        contact_type = entry.xpath('./dt')[0].text_content()
        value = entry.xpath('./dd')[0].text_content().replace('(', '').replace(') ', '-')
        if 'Officials' in contact_type:
          break
        if 'Tel' in contact_type:
          phone = value
        # NOTE(review): 'Fac' presumably matches a "Facsimile" label --
        # confirm against the live page.
        if 'Fac' in contact_type:
          fax = value
        if 'Address' in contact_type:
          address = value
        if 'Email' in contact_type:
          email = value
        if 'Website' in contact_type:
          site = value

      councillors = district_page.xpath('//div[@style="WIDTH:750"]/dl/dt[contains(text(), "Elected Officials")]/parent::dl/dd/pre/text()')[0].splitlines(True)
      for councillor in councillors:
        name = councillor.replace('(Mayor)', '').replace('(Deputy Mayor)', '').replace('(Chairperson)', '').strip()
        # A blank line would otherwise produce a Legislator with no name.
        if not name:
          continue
        # Whatever is left once the name is removed is the parenthesised role.
        role = re.sub(r'\(|\)', '', councillor.replace(name, '').strip())
        if not role:
          role = 'Councillor'
        p = Legislator(name=name, post_id=district_name)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        membership = p.add_membership(org, role=role, post_id=district_name)
        if phone:
          membership.add_contact_detail('voice', clean_telephone_number(phone), 'legislature')
        if fax:
          membership.add_contact_detail('fax', clean_telephone_number(fax), 'legislature')
        if address:
          membership.add_contact_detail('address', clean_address(address), 'legislature')
        if email:
          membership.add_contact_detail('email', email, None)
        if site:
          p.add_link(site, None)
        yield p

Example #6

0

Show file

    def get_people(self):
        """Yield an Organization and a Legislator for each listed official.

        Officials are the <strong> elements found in paragraphs of the
        "entry-content" container on COUNCIL_PAGE.  The nearest preceding
        <h2> names the district, and the paragraph's own text nodes carry the
        address ("NT"), phone ("Tel") and fax ("Fax") lines.  An entry is
        skipped when its role is empty or contains "SAO".
        """
        document = lxmlize(COUNCIL_PAGE)
        officials = document.xpath('//div[@class="entry-content"]//p/strong')

        for official in officials:
            headings = official.xpath('./ancestor::p/preceding-sibling::h2')
            heading_text = headings[-1].text_content()
            district = heading_text.split('–'.decode('utf-8'))[0]

            caption = official.text_content()
            last_two_words = caption.split()[-2:]
            name = ' '.join(last_two_words).replace(
                '-Â'.decode('utf-8'), '')
            # With the name removed, the part before the first hyphen is the
            # role.
            role = caption.replace(name, '').split('-')[0]
            if not role or 'SAO' in role:
                continue

            org = Organization(
                name=district + ' Municipal Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(COUNCIL_PAGE)
            yield org

            person = Legislator(name=name, post_id=district)
            person.add_source(COUNCIL_PAGE)
            membership = person.add_membership(
                org, role=role, post_id=district)

            for piece in official.xpath('./ancestor::p/text()'):
                if 'NT' in piece:
                    membership.add_contact_detail(
                        'address', piece.strip(), 'legislature')
                # Sequential checks: the "Fax" test below looks at the value
                # already rewritten by the "Tel" branch.
                if 'Tel' in piece:
                    piece = piece.replace('Tel. ', '')
                    piece = piece.replace('(', '').replace(') ', '-')
                    piece = piece.strip()
                    membership.add_contact_detail(
                        'voice', piece, 'legislature')
                if 'Fax' in piece:
                    piece = piece.replace('Fax ', '')
                    piece = piece.replace('(', '').replace(') ', '-')
                    piece = piece.strip()
                    membership.add_contact_detail(
                        'fax', piece, 'legislature')

            addresses = official.xpath(
                './parent::p//a[contains(@href, "mailto:")]/text()')
            membership.add_contact_detail('email', addresses[0], None)

            container = official.xpath('./parent::p')[0]
            if 'Website' in container.text_content():
                # The paragraph's second anchor is taken as the website.
                links = official.xpath('./parent::p//a')
                person.add_link(links[1].attrib['href'], None)
            yield person

Example #7

0

Show file

    def get_people(self):
        """Yield councils and their members by crawling three page levels.

        The first four links of the first "bluearrow" list on COUNCIL_PAGE
        each lead to a listing page; every district link on a listing page
        leads to a detail page holding the contact block and the member
        names.  The position of the listing link selects the organization
        name suffix from ``org_types``.
        """
        index_page = lxmlize(COUNCIL_PAGE)
        type_links = index_page.xpath(
            '//div[@class="bluearrow shaded bottomborder "][1]/ul/li/a/@href'
        )[:4]

        for type_index, link in enumerate(type_links):
            listing = lxmlize(link)
            district_urls = listing.xpath(
                '//div[@class="parbase list section cplist"]/table/tr/td[1]/b/a/@href'
            )
            for district_url in district_urls:
                detail = lxmlize(district_url)
                heading = detail.xpath(
                    '//div[@class="pageHeader"]/h1/text()')[0]
                # The district name is the part after " - " in the heading.
                district = heading.split(' - ')[1].strip()

                org = Organization(
                    name=district + org_types[type_index],
                    classification='legislature',
                    jurisdiction_id=self.jurisdiction.jurisdiction_id)
                org.add_source(district_url)
                yield org

                address_lines = detail.xpath(
                    '//div[@class="left_contents"]/p[1]/text()')
                address = ', '.join(address_lines)

                contacts = detail.xpath(
                    '//div[@class="left_contents"]/p[b[text() = "Contact"]]/text()'
                )
                # First contact line is the phone, second the fax.
                phone = contacts[0].split(':')[1].strip().replace(' ', '-')
                fax = contacts[1].split(':')[1].strip().replace(' ', '-')

                mail_anchors = detail.xpath(
                    '//div[@class="left_contents"]//a[contains(@href, "mailto:")]'
                )
                email = None
                if mail_anchors:
                    email = mail_anchors[0].text_content()

                site_anchors = detail.xpath(
                    '//div[@class="left_contents"]//a[not(contains(@href,"mailto:"))]'
                )
                site = None
                if site_anchors:
                    site = site_anchors[0].text_content()

                member_names = detail.xpath(
                    '//div[@class="right_contents"]//p/text()')
                for position, member_name in enumerate(member_names):
                    if 'Vacant' in member_name:
                        continue
                    person = Legislator(name=member_name, post_id=district)
                    for source in (COUNCIL_PAGE, link, district_url):
                        person.add_source(source)

                    # Only the very first listed name is treated as the mayor.
                    role = 'Mayor' if position == 0 else 'Councillor'
                    membership = person.add_membership(org, role=role)

                    membership.post_id = district
                    membership.add_contact_detail(
                        'address', address, 'legislature')
                    if phone:
                        membership.add_contact_detail(
                            'voice', phone, 'legislature')
                    if fax:
                        membership.add_contact_detail(
                            'fax', fax, 'legislature')
                    if email:
                        membership.add_contact_detail('email', email, None)
                    if site:
                        person.add_link(site, None)
                    yield person

Example #8

0

Show file

File: people.py Project: fchagnon/scrapers-ca

    def get_people(self):
        """Yield a council Organization and its mayor per directory line.

        The "Municipal Directory" PDF linked from COUNCIL_PAGE is downloaded
        and converted with the external ``pdftotext -layout`` tool.  The text
        is split on "Municipal Directory"; within each part, the line that
        contains "Official Name" fixes the character columns used to slice
        district, name, phone, email and address out of every other line.
        """
        page = lxmlize(COUNCIL_PAGE)
        url = page.xpath(
            '//a[contains(text(),"Municipal Directory")]/@href')[0]

        response = urllib2.urlopen(url).read()
        # A PDF is binary data: write it in binary mode, and let the
        # with-block close the handle even if the write fails.
        with open('/tmp/nl.pdf', 'wb') as pdf:
            pdf.write(response)

        try:
            data = subprocess.check_output(
                ['pdftotext', '-layout', '/tmp/nl.pdf', '-'])
        finally:
            # The converted text is in memory, so the temp file can go right
            # away; this also cleans up when conversion fails or when the
            # generator is never run to completion.
            os.remove('/tmp/nl.pdf')

        for page_text in data.split('Municipal Directory')[1:]:
            page_lines = page_text.splitlines(True)

            # Column boundaries are character offsets of the header labels.
            column_index = {}
            for line in page_lines:
                if 'Official Name' in line:
                    column_index['dist_end'] = re.search(
                        'Region', line).start()
                    column_index['name_start'] = re.search(
                        'Mayor', line).start() + 1
                    column_index['name_end'] = re.search(
                        'Clerk', line).start() - 1
                    column_index['phone_start'] = re.search(
                        'Line 1', line).start()
                    column_index['phone_end'] = re.search(
                        'Line 2', line).start() - 1
                    column_index['fax_start'] = re.search(
                        'Fax', line).start()
                    column_index['fax_end'] = re.search(
                        'E-mail', line).start() - 2
                    column_index['email_start'] = column_index['fax_end'] + 1
                    column_index['email_end'] = re.search(
                        'Address', line).start() - 1
                    column_index['address_start'] = (
                        column_index['email_end'] + 1)
                    column_index['address_end'] = re.search(
                        'Days', line).start() - 1
                    break

            for line in page_lines:
                if 'Official Name' in line or not line.strip():
                    continue
                # Strip the column padding: without it the organization name
                # carries trailing blanks and the emptiness check below can
                # never trigger.
                district = line[:column_index['dist_end']].strip()
                name = line[column_index['name_start']:
                            column_index['name_end']].strip()
                phone = line[column_index['phone_start']:
                             column_index['phone_end']].strip()
                phone = phone.replace('(', '').replace(') ', '-')
                email = line[column_index['email_start']:
                             column_index['email_end']].strip()
                address = line[column_index['address_start']:
                               column_index['address_end']].strip()
                # Runs of blanks inside the address column separate its parts.
                address = re.sub(r'\s{2,}', ', ', address)
                if not name or not district:
                    continue

                org = Organization(
                    name=district + ' Municipal Council',
                    classification='legislature',
                    jurisdiction_id=self.jurisdiction.jurisdiction_id)
                org.add_source(COUNCIL_PAGE)
                org.add_source(url)
                yield org

                p = Legislator(name=name, post_id=district)
                p.add_source(COUNCIL_PAGE)
                p.add_source(url)
                membership = p.add_membership(org,
                                              role='Mayor',
                                              post_id=district)
                if phone:
                    membership.add_contact_detail('voice', phone,
                                                  'legislature')
                # The fax column is deliberately not read: it isn't properly
                # aligned in the converted text.
                if email:
                    membership.add_contact_detail('email', email, None)
                if address:
                    membership.add_contact_detail('address', address,
                                                  'legislature')
                yield p

Example #9

0

Show file

File: people.py Project: fchagnon/scrapers-ca

  def get_people(self):
    """Yield each council and its mayor and councillors from the PDF.

    COUNCIL_PAGE is downloaded as a PDF and converted with the external
    ``pdftotext -layout`` tool.  The text is split on blank lines and only
    chunks containing "Councillors" are processed.  Column boundaries are
    derived from the first line of each chunk.
    """
    response = urllib2.urlopen(COUNCIL_PAGE).read()
    # A PDF is binary data: write it in binary mode, and let the with-block
    # close the handle even if the write fails.
    with open('/tmp/yt.pdf', 'wb') as pdf:
      pdf.write(response)

    try:
      data = subprocess.check_output(['pdftotext', '-layout', '/tmp/yt.pdf', '-'])
    finally:
      # The converted text is in memory, so the temp file can go right away;
      # this also cleans up when conversion fails or when the generator is
      # never run to completion.
      os.remove('/tmp/yt.pdf')

    for municipality in re.split(r'\n\s*\n', data):
      if 'Councillors' not in municipality:
        continue
      lines = municipality.split('\n')
      # Drop a leading page marker and the empty line that may follow it.
      if 'Page' in lines[0]:
        lines.pop(0)
        if not lines[0].strip():
          lines.pop(0)
      header = lines[0].strip()
      # End offsets of the first and second text columns in the header line.
      col1end = re.search(r'\s{2,}(\w)', header).end()
      col2end = re.search(r':\s{2,}(\w)', header).end()

      # A "Council" on the second line means the district name spans two
      # lines, which pushes the address down by one.
      if 'Council' in lines[1]:
        address = lines[2][:col1end - 1].strip() + ' ' + lines[3][:col1end - 1].strip()
        district = lines[0][:col1end - 1].strip() + ' ' + lines[1][:col1end - 1].strip()
      else:
        address = lines[1][:col1end - 1].strip() + ' ' + lines[2][:col1end - 1].strip()
        district = lines[0][:col1end - 1].strip()

      organization = Organization(
        name=district + ' Council',
        classification='legislature',
        jurisdiction_id=self.jurisdiction.jurisdiction_id)
      organization.add_source(COUNCIL_PAGE)
      yield organization

      phone = re.findall(r'(?<=Phone: )\(?(\d{3}[\)-] ?\d{3}-\d{4})', municipality)[0].replace(') ', '-')
      email = re.findall(r'(?<=E-mail:) (\S*)', municipality)[0]
      fax = None
      if 'Fax' in municipality:
        fax = re.findall(r'(?<=Fax: )\(?(\d{3}[\)-] ?\d{3}-\d{4})', municipality)[0].replace(') ', '-')
      website = None
      if 'Website' in municipality:
        website = re.findall(r'((http:\/\/|www.)(\S*))', municipality)[0][0]

      # Names only start after the "Mayor:" label; the most recent label
      # decides the role given to the names that follow it.
      councillor_or_mayor = False
      for line in lines:
        if 'Mayor:' in line:
          councillor_or_mayor = True
          role = 'Mayor'
          continue
        if 'Councillors' in line:
          councillor_or_mayor = True
          role = 'Councillor'
          continue
        if not councillor_or_mayor:
          continue
        councillor = line[col1end - 1:col2end - 1].strip()
        if not councillor:
          continue
        p = Legislator(name=councillor, post_id=district)
        p.add_source(COUNCIL_PAGE)
        membership = p.add_membership(organization, role=role, post_id=district)
        membership.add_contact_detail('address', address, 'legislature')
        membership.add_contact_detail('voice', phone, 'legislature')
        membership.add_contact_detail('email', email, None)
        if fax:
          membership.add_contact_detail('fax', fax, 'legislature')
        if website:
          p.add_link(website, None)
        yield p

Example #10

0

Show file

    def get_people(self):
        """Yield each council and its mayor and councillors from the PDF.

        COUNCIL_PAGE is downloaded as a PDF and converted with the external
        ``pdftotext -layout`` tool.  The text is split on blank lines and
        only chunks containing "Councillors" are processed.  Column
        boundaries are derived from the first line of each chunk.
        """
        response = urllib2.urlopen(COUNCIL_PAGE).read()
        # A PDF is binary data: write it in binary mode, and let the
        # with-block close the handle even if the write fails.
        with open('/tmp/yt.pdf', 'wb') as pdf:
            pdf.write(response)

        try:
            data = subprocess.check_output(
                ['pdftotext', '-layout', '/tmp/yt.pdf', '-'])
        finally:
            # The converted text is in memory, so the temp file can go right
            # away; this also cleans up when conversion fails or when the
            # generator is never run to completion.
            os.remove('/tmp/yt.pdf')

        for municipality in re.split(r'\n\s*\n', data):
            if 'Councillors' not in municipality:
                continue
            lines = municipality.split('\n')
            # Drop a leading page marker and the empty line that may follow.
            if 'Page' in lines[0]:
                lines.pop(0)
                if not lines[0].strip():
                    lines.pop(0)
            header = lines[0].strip()
            # End offsets of the first and second text columns in the header.
            col1end = re.search(r'\s{2,}(\w)', header).end()
            col2end = re.search(r':\s{2,}(\w)', header).end()
            first_column = slice(None, col1end - 1)

            # A "Council" on the second line means the district name spans
            # two lines, which pushes the address down by one.
            if 'Council' in lines[1]:
                address = (lines[2][first_column].strip() + ' ' +
                           lines[3][first_column].strip())
                district = (lines[0][first_column].strip() + ' ' +
                            lines[1][first_column].strip())
            else:
                address = (lines[1][first_column].strip() + ' ' +
                           lines[2][first_column].strip())
                district = lines[0][first_column].strip()

            organization = Organization(
                name=district + ' Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            organization.add_source(COUNCIL_PAGE)
            yield organization

            phone = re.findall(r'(?<=Phone: )\(?(\d{3}[\)-] ?\d{3}-\d{4})',
                               municipality)[0].replace(') ', '-')
            email = re.findall(r'(?<=E-mail:) (\S*)', municipality)[0]
            fax = None
            if 'Fax' in municipality:
                fax = re.findall(r'(?<=Fax: )\(?(\d{3}[\)-] ?\d{3}-\d{4})',
                                 municipality)[0].replace(') ', '-')
            website = None
            if 'Website' in municipality:
                website = re.findall(r'((http:\/\/|www.)(\S*))',
                                     municipality)[0][0]

            # Names only start after the "Mayor:" label; the most recent
            # label decides the role given to the names that follow it.
            councillor_or_mayor = False
            for line in lines:
                if 'Mayor:' in line:
                    councillor_or_mayor = True
                    role = 'Mayor'
                    continue
                if 'Councillors' in line:
                    councillor_or_mayor = True
                    role = 'Councillor'
                    continue
                if not councillor_or_mayor:
                    continue
                councillor = line[col1end - 1:col2end - 1].strip()
                if not councillor:
                    continue
                p = Legislator(name=councillor, post_id=district)
                p.add_source(COUNCIL_PAGE)
                membership = p.add_membership(organization,
                                              role=role,
                                              post_id=district)
                membership.add_contact_detail('address', address,
                                              'legislature')
                membership.add_contact_detail('voice', phone, 'legislature')
                membership.add_contact_detail('email', email, None)
                if fax:
                    membership.add_contact_detail('fax', fax, 'legislature')
                if website:
                    p.add_link(website, None)
                yield p

Example #11

0

Show file

File: people.py Project: fchagnon/scrapers-ca

  def get_people(self):
    """Yield a council Organization and its mayor per directory line.

    The "Municipal Directory" PDF linked from COUNCIL_PAGE is downloaded and
    converted with the external ``pdftotext -layout`` tool.  The text is
    split on "Municipal Directory"; within each part, the line containing
    "Official Name" fixes the character columns used to slice district,
    name, phone, email and address out of every other line.
    """
    page = lxmlize(COUNCIL_PAGE)
    url = page.xpath('//a[contains(text(),"Municipal Directory")]/@href')[0]

    response = urllib2.urlopen(url).read()
    # A PDF is binary data: write it in binary mode, and let the with-block
    # close the handle even if the write fails.
    with open('/tmp/nl.pdf', 'wb') as pdf:
      pdf.write(response)

    try:
      data = subprocess.check_output(['pdftotext', '-layout', '/tmp/nl.pdf', '-'])
    finally:
      # The converted text is in memory, so the temp file can go right away;
      # this also cleans up when conversion fails or when the generator is
      # never run to completion.
      os.remove('/tmp/nl.pdf')

    for page_text in data.split('Municipal Directory')[1:]:
      page_lines = page_text.splitlines(True)

      # Column boundaries are character offsets of the header labels.
      column_index = {}
      for line in page_lines:
        if 'Official Name' in line:
          column_index['dist_end'] = re.search('Region', line).start()
          column_index['name_start'] = re.search('Mayor', line).start() + 1
          column_index['name_end'] = re.search('Clerk', line).start() - 1
          column_index['phone_start'] = re.search('Line 1', line).start()
          column_index['phone_end'] = re.search('Line 2', line).start() - 1
          column_index['fax_start'] = re.search('Fax', line).start()
          column_index['fax_end'] = re.search('E-mail', line).start() - 2
          column_index['email_start'] = column_index['fax_end'] + 1
          column_index['email_end'] = re.search('Address', line).start() - 1
          column_index['address_start'] = column_index['email_end'] + 1
          column_index['address_end'] = re.search('Days', line).start() - 1
          break

      for line in page_lines:
        if 'Official Name' in line or not line.strip():
          continue
        # Strip the column padding: without it the organization name carries
        # trailing blanks and the emptiness check below can never trigger.
        district = line[:column_index['dist_end']].strip()
        name = line[column_index['name_start']:column_index['name_end']].strip()
        phone = line[column_index['phone_start']:column_index['phone_end']].strip().replace('(', '').replace(') ', '-')
        email = line[column_index['email_start']:column_index['email_end']].strip()
        address = line[column_index['address_start']:column_index['address_end']].strip()
        # Runs of blanks inside the address column separate its parts.
        address = re.sub(r'\s{2,}', ', ', address)
        if not name or not district:
          continue

        org = Organization(
          name=district + ' Municipal Council',
          classification='legislature',
          jurisdiction_id=self.jurisdiction.jurisdiction_id)
        org.add_source(COUNCIL_PAGE)
        org.add_source(url)
        yield org

        p = Legislator(name=name, post_id=district)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        membership = p.add_membership(org, role='Mayor', post_id=district)
        if phone:
          membership.add_contact_detail('voice', phone, 'legislature')
        # The fax column is deliberately not read: it isn't properly aligned
        # in the converted text.
        if email:
          membership.add_contact_detail('email', email, None)
        if address:
          membership.add_contact_detail('address', address, 'legislature')
        yield p

Example #12

0

Show file

File: people.py Project: fchagnon/scrapers-ca

    def get_people(self):
        """Scrape councils and their members from the COUNCIL_PAGE PDF.

        The PDF is saved to /tmp/sk.pdf, converted to text with
        ``pdftotext -layout`` and parsed as a two-column listing: every
        block of consecutive content lines is split into a left and a right
        column, and each column is treated as one district whose first line
        carries the district name.

        Yields:
            For every district with at least one named member, the
            district's Organization followed by one Legislator per member.
            Districts without named members yield nothing.
        """
        def second_field(text):
            # Second colon-separated field of ``text``; '' when there is no
            # colon, so a keyword that merely appears inside another line
            # (e.g. an address continuation) cannot raise IndexError.
            parts = text.split(':')
            return parts[1] if len(parts) > 1 else ''

        response = urllib2.urlopen(COUNCIL_PAGE).read()
        try:
            # A PDF is binary data, so it must be written in binary mode.
            with open('/tmp/sk.pdf', 'wb') as pdf:
                pdf.write(response)
            data = subprocess.check_output(
                ['pdftotext', '-layout', '/tmp/sk.pdf', '-'])
        finally:
            # Only pdftotext needs the file: remove it immediately so it is
            # cleaned up even when conversion fails or the caller never
            # exhausts this generator.
            try:
                os.remove('/tmp/sk.pdf')
            except OSError:
                pass

        # Group consecutive content lines into blocks. Blank lines and the
        # page/section header lines act as separators between blocks.
        pages = []
        page = []
        for line in data.splitlines(True):
            is_header = ('Page' in line or 'CITIES' in line
                         or 'NORTHERN TOWNS, VILLAGES' in line)
            if line.strip() and not is_header:
                page.append(line)
            elif page:
                pages.append(page)
                page = []
        if page:
            # Keep the final block when the text does not end on a separator.
            pages.append(page)

        districts = []
        for page in pages:
            # The first run of 6+ whitespace characters on the block's first
            # line is taken as the gap between the two columns.
            gap = re.search(r'(\s{6,})', page[0])
            # Without a gap, -1 puts all but the final character (normally
            # the line ending kept by splitlines(True)) in the left column.
            index = gap.end() - 1 if gap else -1
            districts.append([line[:index].strip() for line in page])
            districts.append([line[index:].strip() for line in page])

        for district in districts:
            district_name = district.pop(0).split(',')[0].title()

            org = Organization(
                name=district_name + ' Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(COUNCIL_PAGE)

            councillors = []
            contacts = {}
            for i, line in enumerate(district):
                if 'Phone' in line:
                    phone = second_field(line).replace('(', '').replace(
                        ') ', '-').strip()
                    if phone:
                        contacts['voice'] = phone
                if 'Fax' in line:
                    fax = second_field(line).replace('(', '').replace(
                        ') ', '-').strip()
                    if fax:
                        contacts['fax'] = fax
                if 'E-Mail' in line:
                    email = second_field(line).strip()
                    if email:
                        contacts['email'] = email
                if 'Address' in line and second_field(line).strip():
                    # The address continues over every remaining line of the
                    # district.
                    address = second_field(line).strip() + ', ' + ', '.join(
                        district[i + 1:]).replace(' ,', '')
                    contacts['address'] = address
                if 'Mayor' in line or 'Councillor' in line or 'Alderman' in line:
                    councillor = second_field(line)
                    for honorific in ('Mr.', 'Mrs.', 'Ms.',
                                      'His Worship', 'Her Worship'):
                        councillor = councillor.replace(honorific, '')
                    councillor = councillor.strip()
                    role = line.split(':')[0].strip()
                    if councillor:
                        councillors.append([councillor, role])

            if not councillors:
                continue
            yield org
            for councillor in councillors:
                p = Legislator(name=councillor[0], post_id=district_name)
                p.add_source(COUNCIL_PAGE)
                membership = p.add_membership(org,
                                              role=councillor[1],
                                              post_id=district_name)

                # Every member of a district shares the district's contacts.
                for key, value in contacts.items():
                    membership.add_contact_detail(
                        key, value, None if key == 'email' else 'legislature')
                yield p

Example #13

0

Show file

File: people.py Project: fchagnon/scrapers-ca

  def get_people(self):
    """Scrape councils and their members from the COUNCIL_PAGE PDF.

    The PDF is saved to /tmp/sk.pdf, converted to text with
    ``pdftotext -layout`` and parsed as a two-column listing: every block of
    consecutive content lines is split into a left and a right column, and
    each column is treated as one district whose first line carries the
    district name.

    Yields:
      For every district with at least one named member, the district's
      Organization followed by one Legislator per member. Districts without
      named members yield nothing.
    """
    def second_field(text):
      # Second colon-separated field of ``text``; '' when there is no colon,
      # so a keyword that merely appears inside another line (e.g. an address
      # continuation) cannot raise IndexError.
      parts = text.split(':')
      return parts[1] if len(parts) > 1 else ''

    response = urllib2.urlopen(COUNCIL_PAGE).read()
    try:
      # A PDF is binary data, so it must be written in binary mode.
      with open('/tmp/sk.pdf', 'wb') as pdf:
        pdf.write(response)
      data = subprocess.check_output(['pdftotext', '-layout', '/tmp/sk.pdf', '-'])
    finally:
      # Only pdftotext needs the file: remove it immediately so it is cleaned
      # up even when conversion fails or the caller never exhausts this
      # generator.
      try:
        os.remove('/tmp/sk.pdf')
      except OSError:
        pass

    # Group consecutive content lines into blocks. Blank lines and the
    # page/section header lines act as separators between blocks.
    pages = []
    page = []
    for line in data.splitlines(True):
      is_header = 'Page' in line or 'CITIES' in line or 'NORTHERN TOWNS, VILLAGES' in line
      if line.strip() and not is_header:
        page.append(line)
      elif page:
        pages.append(page)
        page = []
    if page:
      # Keep the final block when the text does not end on a separator.
      pages.append(page)

    districts = []
    for page in pages:
      # The first run of 6+ whitespace characters on the block's first line
      # is taken as the gap between the two columns.
      gap = re.search(r'(\s{6,})', page[0])
      # Without a gap, -1 puts all but the final character (normally the line
      # ending kept by splitlines(True)) in the left column.
      index = gap.end() - 1 if gap else -1
      districts.append([line[:index].strip() for line in page])
      districts.append([line[index:].strip() for line in page])

    for district in districts:
      district_name = district.pop(0).split(',')[0].title()

      org = Organization(name=district_name + ' Council', classification='legislature', jurisdiction_id=self.jurisdiction.jurisdiction_id)
      org.add_source(COUNCIL_PAGE)

      councillors = []
      contacts = {}
      for i, line in enumerate(district):
        if 'Phone' in line:
          phone = second_field(line).replace('(', '').replace(') ', '-').strip()
          if phone:
            contacts['voice'] = phone
        if 'Fax' in line:
          fax = second_field(line).replace('(', '').replace(') ', '-').strip()
          if fax:
            contacts['fax'] = fax
        if 'E-Mail' in line:
          email = second_field(line).strip()
          if email:
            contacts['email'] = email
        if 'Address' in line and second_field(line).strip():
          # The address continues over every remaining line of the district.
          address = second_field(line).strip() + ', ' + ', '.join(district[i + 1:]).replace(' ,', '')
          contacts['address'] = address
        if 'Mayor' in line or 'Councillor' in line or 'Alderman' in line:
          councillor = second_field(line)
          for honorific in ('Mr.', 'Mrs.', 'Ms.', 'His Worship', 'Her Worship'):
            councillor = councillor.replace(honorific, '')
          councillor = councillor.strip()
          role = line.split(':')[0].strip()
          if councillor:
            councillors.append([councillor, role])

      if not councillors:
        continue
      yield org
      for councillor in councillors:
        p = Legislator(name=councillor[0], post_id=district_name)
        p.add_source(COUNCIL_PAGE)
        membership = p.add_membership(org, role=councillor[1], post_id=district_name)

        # Every member of a district shares the district's contacts.
        for key, value in contacts.items():
          membership.add_contact_detail(key, value, None if key == 'email' else 'legislature')
        yield p

Example #14

0

Show file

File: people.py Project: fchagnon/scrapers-ca

  def get_people(self):
    """Yield each district's Organization followed by its Legislators.

    Follows the first four organization-type links found on COUNCIL_PAGE,
    then every district link on each of those listing pages, and reads the
    contact details and member names from the district page itself. The
    first name listed on a district page is given the role 'Mayor'; all
    others are 'Councillor'. Entries containing 'Vacant' are skipped.
    """
    landing = lxmlize(COUNCIL_PAGE)
    type_links = landing.xpath('//div[@class="bluearrow shaded bottomborder "][1]/ul/li/a/@href')[:4]
    for org_type, link in enumerate(type_links):
      listing = lxmlize(link)
      for district_url in listing.xpath('//div[@class="parbase list section cplist"]/table/tr/td[1]/b/a/@href'):
        detail = lxmlize(district_url)
        heading = detail.xpath('//div[@class="pageHeader"]/h1/text()')[0]
        district = heading.split(' - ')[1].strip()

        # The position of the type link selects the organization name suffix.
        org = Organization(
          name=district + org_types[org_type],
          classification='legislature',
          jurisdiction_id=self.jurisdiction.jurisdiction_id)
        org.add_source(district_url)
        yield org

        address = ', '.join(detail.xpath('//div[@class="left_contents"]/p[1]/text()'))
        contact_lines = detail.xpath('//div[@class="left_contents"]/p[b[text() = "Contact"]]/text()')
        phone = contact_lines[0].split(':')[1].strip().replace(' ', '-')
        fax = contact_lines[1].split(':')[1].strip().replace(' ', '-')

        # Both stay as the (falsy) empty result list when no link is found.
        mail_links = detail.xpath('//div[@class="left_contents"]//a[contains(@href, "mailto:")]')
        email = mail_links[0].text_content() if mail_links else mail_links
        site_links = detail.xpath('//div[@class="left_contents"]//a[not(contains(@href,"mailto:"))]')
        site = site_links[0].text_content() if site_links else site_links

        optional_contacts = (
          ('voice', phone, 'legislature'),
          ('fax', fax, 'legislature'),
          ('email', email, None),
        )

        names = detail.xpath('//div[@class="right_contents"]//p/text()')
        for position, name in enumerate(names):
          if 'Vacant' in name:
            continue
          p = Legislator(name=name, post_id=district)
          for source in (COUNCIL_PAGE, link, district_url):
            p.add_source(source)

          role = 'Mayor' if position == 0 else 'Councillor'
          membership = p.add_membership(org, role=role)
          membership.post_id = district

          # The address is always recorded; the other details only when set.
          membership.add_contact_detail('address', address, 'legislature')
          for kind, value, note in optional_contacts:
            if value:
              membership.add_contact_detail(kind, value, note)
          if site:
            p.add_link(site, None)
          yield p