コード例 #1
0
ファイル: people.py プロジェクト: fchagnon/scrapers-ca
  def get_people(self):
    """Yield an Organization and a Legislator for each member on COUNCIL_PAGE.

    Every <strong> element inside a paragraph of the "entry-content" div is
    treated as one member. Entries whose role is empty or contains 'SAO' are
    skipped. For each remaining entry the council Organization is yielded
    first, followed by the Legislator with its membership contact details.
    """
    def tidy_number(text, label):
      # Remove the label and rewrite "(xxx) yyy" as "xxx-yyy".
      return text.replace(label, '').replace('(', '').replace(') ', '-').strip()

    root = lxmlize(COUNCIL_PAGE)

    for strong in root.xpath('//div[@class="entry-content"]//p/strong'):
      # The district is the text before the dash in the closest preceding <h2>.
      heading = strong.xpath('./ancestor::p/preceding-sibling::h2')[-1]
      district = heading.text_content().split('–'.decode('utf-8'))[0]

      # The last two words of the label are taken as the name; whatever
      # precedes them (up to the first hyphen) is the role.
      label = strong.text_content()
      name = ' '.join(label.split()[-2:]).replace('-Â'.decode('utf-8'), '')
      role = label.replace(name, '').split('-')[0]
      if not role or 'SAO' in role:
        continue

      council = Organization(name=district + ' Municipal Council', classification='legislature', jurisdiction_id=self.jurisdiction.jurisdiction_id)
      council.add_source(COUNCIL_PAGE)
      yield council

      person = Legislator(name=name, post_id=district)
      person.add_source(COUNCIL_PAGE)
      membership = person.add_membership(council, role=role, post_id=district)

      for line in strong.xpath('./ancestor::p/text()'):
        if 'NT' in line:
          membership.add_contact_detail('address', line.strip(), 'legislature')
        if 'Tel' in line:
          line = tidy_number(line, 'Tel. ')
          membership.add_contact_detail('voice', line, 'legislature')
        # Checked against the (possibly already tidied) line, as before.
        if 'Fax' in line:
          line = tidy_number(line, 'Fax ')
          membership.add_contact_detail('fax', line, 'legislature')

      mailto_text = strong.xpath('./parent::p//a[contains(@href, "mailto:")]/text()')
      membership.add_contact_detail('email', mailto_text[0], None)

      paragraph = strong.xpath('./parent::p')[0]
      if 'Website' in paragraph.text_content():
        # The second link of the paragraph is used as the website link.
        anchors = strong.xpath('./parent::p//a')
        person.add_link(anchors[1].attrib['href'], None)
      yield person
コード例 #2
0
    def get_people(self):
        """Yield the Temecula City Council, then one Person per table row.

        The first row of the second table on the list page is skipped
        (presumably a header row). Each remaining row must provide exactly
        two text nodes (name, role) and exactly two links (e-mail, detail
        page); otherwise the tuple unpacking raises ValueError.
        """
        urls = Urls(dict(list=legislators_url), self)

        city_council = Organization(
            'Temecula City Council',
            classification='legislature')
        city_council.add_source(urls.list.url)
        yield city_council

        rows = urls.list.xpath('//table[2]//tr')
        for row in rows[1:]:
            # Name, role and portrait come straight from the row's cells.
            full_name, title = row.xpath('td/p[1]//font/text()')
            photo = row.xpath('td/img/@src').pop()

            member = Person(full_name, image=photo)
            seat = member.add_membership(city_council, role=title)

            mailto_href, detail_url = row.xpath('td//a/@href')
            # Drop the first seven characters (presumably the "mailto:" scheme).
            seat.contact_details.append({
                'type': 'email',
                'value': mailto_href[7:],
                'note': 'work',
            })

            member.add_source(urls.list.url)
            member.add_source(detail_url)

            yield member
コード例 #3
0
    def get_people(self):
        """Scrape the legislator list and yield the council and its members.

        Yields the 'Temecula City Council' Organization first, then a Person
        for every row after the first in the second table of the list page.
        """
        urls = Urls(dict(list=legislators_url), self)

        org = Organization('Temecula City Council',
                           classification='legislature')
        org.add_source(urls.list.url)
        yield org

        for row in urls.list.xpath('//table[2]//tr')[1:]:
            # Exactly two text nodes are expected: the name and the role.
            texts = row.xpath('td/p[1]//font/text()')
            name, role = texts
            portrait = row.xpath('td/img/@src').pop()

            legislator = Person(name, image=portrait)
            membership = legislator.add_membership(org, role=role)

            # Exactly two hrefs are expected: the e-mail link and the detail page.
            hrefs = row.xpath('td//a/@href')
            email_href, detail_url = hrefs
            address = email_href[7:]
            email_detail = dict(type='email', value=address, note='work')
            membership.contact_details.append(email_detail)

            for source in (urls.list.url, detail_url):
                legislator.add_source(source)

            yield legislator
コード例 #4
0
ファイル: jurisdiction.py プロジェクト: finestjava/pupa
def import_jurisdiction(org_importer, jurisdiction):
    """Save a jurisdiction to the database and import its organizations.

    The jurisdiction's database object is stamped with its type, ID and the
    current UTC time, validated against ``jurisdiction_schema`` and saved to
    ``db.jurisdictions``. A 'legislature' Organization named after the
    jurisdiction is then imported, followed by one 'party' Organization per
    entry in ``jurisdiction.parties``.

    Args:
        org_importer: object whose ``import_object`` is called with each
            Organization created here.
        jurisdiction: object providing ``get_db_object()``,
            ``jurisdiction_id``, ``name``, ``other_names``, ``parent_id``
            and ``parties`` (an iterable of mappings with a 'name' key).

    Raises:
        Whatever ``DatetimeValidator.validate`` raises for an invalid object
        (the previous code anticipated ``ValueError``); it propagates
        unchanged and nothing is saved.
    """
    obj = jurisdiction.get_db_object()

    obj['_type'] = 'jurisdiction'
    obj['_id'] = jurisdiction.jurisdiction_id
    obj['latest_update'] = datetime.datetime.utcnow()

    # Validate before saving. Errors are allowed to propagate directly: the
    # former ``except ValueError as ve: raise ve`` wrapper added nothing and
    # replaced the original traceback on Python 2.
    DatetimeValidator().validate(obj, jurisdiction_schema)

    db.jurisdictions.save(obj)

    # create organization(s) (TODO: if there are multiple chambers this isn't right)
    org = Organization(name=jurisdiction.name, classification='legislature',
                       jurisdiction_id=jurisdiction.jurisdiction_id)
    if jurisdiction.other_names:
        org.other_names = jurisdiction.other_names
    if jurisdiction.parent_id:
        org.parent_id = jurisdiction.parent_id

    org_importer.import_object(org)

    # Create one party organization per declared party.
    for party in jurisdiction.parties:
        party_org = Organization(classification='party',
                                 name=party['name'],
                                 parent_id=None)
        org_importer.import_object(party_org)
コード例 #5
0
ファイル: people.py プロジェクト: fchagnon/scrapers-ca
    def get_people(self):
        """Yield a council Organization and its Legislators for each district.

        Districts are every third table row, starting at the sixth, inside the
        "ctl00_PublicContent_divSearchContent" div of COUNCIL_PAGE. All members
        of a district share the district's address, fax, phone and e-mail.
        """
        root = lxmlize(COUNCIL_PAGE)

        rows = root.xpath(
            '//div[@id="ctl00_PublicContent_divSearchContent"]//tr')
        for row in rows[5::3]:
            texts = row.xpath('.//td//text()')
            # When the first text node is a single character, the title is
            # the first two text nodes joined together.
            first = texts[0]
            # @todo Need to distinguish between, e.g., R.M. and Town
            title = (first if len(first) > 1 else ''.join(texts[:2])).title()

            council = Organization(
                name=title + ' Municipal Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            council.add_source(COUNCIL_PAGE)
            yield council

            cell_text = row.xpath('.//td/text()')
            # The address is the first four text nodes minus any "Fax:" tail.
            address = re.sub(r'(Fax:.*)', '', ' '.join(cell_text[:4])).strip()
            fax_lines = [text for text in cell_text if 'Fax' in text]
            fax = fax_lines[0].split(':')[1].strip()

            phone_text = row.xpath(
                './/b[contains(text(), "Phone")]/text()')[0]
            phone = phone_text.split(':')[1].strip()
            email = row.xpath(
                './/a[contains(@href, "mailto:")]/text()')[0].strip()

            names = row.xpath('.//td[3]/text()')
            positions = row.xpath('.//td[2]/b/text()')
            for index, member_name in enumerate(names):
                person = Legislator(name=member_name, post_id=title)
                person.add_source(COUNCIL_PAGE)

                # Only the first two names have an explicit position label.
                # @todo "Resident Administrator & Chief Administrative
                # Officer" is split on two lines
                role = positions[index] if index < 2 else 'Councillor'
                membership = person.add_membership(council, role=role)

                membership.post_id = title
                membership.add_contact_detail('address', address,
                                              'legislature')
                membership.add_contact_detail('fax', fax, 'legislature')
                membership.add_contact_detail('voice', phone, 'legislature')
                membership.add_contact_detail('email', email, None)
                yield person
コード例 #6
0
ファイル: test_organization.py プロジェクト: lfalvarez/pupa
def test_add_contact():
    """Check that an organization still validates after gaining a contact detail."""
    org = Organization("name")
    org.add_source(url='foo')
    # Baseline: the organization is valid before any contact detail is added.
    org.validate()

    detail = dict(type='voice', value='555-393-2821', note='nothing')
    org.add_contact_detail(**detail)

    org.validate()
コード例 #7
0
ファイル: people.py プロジェクト: fchagnon/scrapers-ca
  def get_people(self):
    """Yield an Organization and a Legislator per mayor/warden in the PDF.

    The PDF at COUNCIL_PAGE is downloaded to a temporary file, converted to
    text with the external ``pdftotext`` tool and split on the "Mayor " and
    "Warden " markers. For each resulting entry the council Organization is
    yielded first, then the Legislator with address, phone, fax (when
    present) and any matching e-mail addresses.

    Raises:
        subprocess.CalledProcessError: if ``pdftotext`` exits with an error.
        IndexError: if an entry has fewer lines than the parser expects.
    """
    pdf_path = '/tmp/ns.pdf'
    response = urllib2.urlopen(COUNCIL_PAGE).read()
    # The download is raw PDF data: write it in binary mode and let the
    # context manager close the file even if the write fails.
    with open(pdf_path, 'wb') as pdf:
      pdf.write(response)

    try:
      data = subprocess.check_output(['pdftotext', pdf_path, '-'])
    finally:
      # The file is only needed for the conversion, so remove it right away
      # instead of relying on the generator being run to completion.
      os.remove(pdf_path)

    emails = re.findall(r'(?<=E-mail: ).+', data)
    for entry in re.split(r'Mayor |Warden ', data)[1:]:
      lines = entry.splitlines(True)
      name = lines.pop(0).strip()
      if name == "Jim Smith":
        continue
      district = lines.pop(0).strip()
      # If the next line has no digit it is treated as the continuation of
      # the district name (presumably the address line starts with digits).
      if not re.findall(r'[0-9]', lines[0]):
        district = district + ' ' + lines.pop(0).strip()

      org = Organization(name=district + ' Municipal Council', classification='legislature', jurisdiction_id=self.jurisdiction.jurisdiction_id)
      org.add_source(COUNCIL_PAGE)
      yield org

      p = Legislator(name=name, post_id=district)
      p.add_source(COUNCIL_PAGE)
      # NOTE(review): the role is 'Mayor' even for entries introduced by the
      # "Warden " marker - confirm this is intended.
      membership = p.add_membership(org, role='Mayor', post_id=district)

      # The address spans two to four lines, ending before the "Phone" line.
      address = lines.pop(0).strip() + ', ' + lines.pop(0).strip()
      if 'Phone' not in lines[0]:
        address = address + ', ' + lines.pop(0).strip()

      if 'Phone' not in lines[0]:
        address = address + ', ' + lines.pop(0).strip()

      phone = lines.pop(0).split(':')[1].strip()

      # Reset per entry so a missing fax neither raises UnboundLocalError nor
      # reuses the previous municipality's number.
      # NOTE(review): the value is read from the line *after* the one that
      # contains "Fax" - verify against the pdftotext output.
      fax = None
      if 'Fax' in lines.pop(0):
        fax = lines.pop(0).strip()

      membership.add_contact_detail('address', address, 'legislature')
      membership.add_contact_detail('voice', phone, 'legislature')
      if fax:
        membership.add_contact_detail('fax', fax, 'legislature')

      # @todo emails are being assigned incorrectly, e.g. Town of Berwick picks
      # up Cape Breton Regional Municipality and Region of Queens Municipality
      # Match on the surname or on either of the last two district words.
      regex = name.split()[-1].lower() + '|' + '|'.join(district.split()[-2:]).replace('of', '').lower()
      regex = regex.replace('||', '|')
      # Build the list of leftovers instead of popping from ``emails`` while
      # iterating over it, which used to skip the element after every match.
      unmatched = []
      for email in emails:
        if re.findall(regex, email):
          membership.add_contact_detail('email', email, None)
        else:
          unmatched.append(email)
      emails = unmatched
      yield p
コード例 #8
0
ファイル: people.py プロジェクト: fchagnon/scrapers-ca
  def get_people(self):
    """Yield an Organization and its Legislators for every listed district.

    The first four links of the first "bluearrow" list on COUNCIL_PAGE are
    followed; each leads to a listing of district pages. Every district page
    yields one Organization, then one Legislator per non-vacant name, all
    sharing the district's contact details.
    """
    index_page = lxmlize(COUNCIL_PAGE)
    type_links = index_page.xpath('//div[@class="bluearrow shaded bottomborder "][1]/ul/li/a/@href')[:4]
    for type_index, link in enumerate(type_links):
      listing = lxmlize(link)
      for district_url in listing.xpath('//div[@class="parbase list section cplist"]/table/tr/td[1]/b/a/@href'):
        detail = lxmlize(district_url)
        # The district name is the part after " - " in the page header.
        header = detail.xpath('//div[@class="pageHeader"]/h1/text()')[0]
        district = header.split(' - ')[1].strip()

        # org_types maps the position of the link to a name suffix.
        council = Organization(name=district + org_types[type_index], classification='legislature', jurisdiction_id=self.jurisdiction.jurisdiction_id)
        council.add_source(district_url)
        yield council

        address = ', '.join(detail.xpath('//div[@class="left_contents"]/p[1]/text()'))
        contact_lines = detail.xpath('//div[@class="left_contents"]/p[b[text() = "Contact"]]/text()')
        phone = contact_lines[0].split(':')[1].strip().replace(' ', '-')
        fax = contact_lines[1].split(':')[1].strip().replace(' ', '-')

        mail_links = detail.xpath('//div[@class="left_contents"]//a[contains(@href, "mailto:")]')
        email = mail_links[0].text_content() if mail_links else None

        web_links = detail.xpath('//div[@class="left_contents"]//a[not(contains(@href,"mailto:"))]')
        site = web_links[0].text_content() if web_links else None

        names = detail.xpath('//div[@class="right_contents"]//p/text()')
        for position, member_name in enumerate(names):
          if 'Vacant' in member_name:
            continue
          person = Legislator(name=member_name, post_id=district)
          for source in (COUNCIL_PAGE, link, district_url):
            person.add_source(source)

          # The first listed name is the mayor; everyone else is a councillor.
          role = 'Mayor' if position == 0 else 'Councillor'
          membership = person.add_membership(council, role=role)

          membership.post_id = district
          membership.add_contact_detail('address', address, 'legislature')
          if phone:
            membership.add_contact_detail('voice', phone, 'legislature')
          if fax:
            membership.add_contact_detail('fax', fax, 'legislature')
          if email:
            membership.add_contact_detail('email', email, None)
          if site:
            person.add_link(site, None)
          yield person
コード例 #9
0
    def get_people(self):
        """Scrape COUNCIL_PAGE and yield each council and its member.

        One entry is processed per <strong> element found in a paragraph of
        the "entry-content" div. Entries with an empty role, or a role
        containing 'SAO', are ignored. The Organization is yielded before the
        Legislator that belongs to it.
        """
        document = lxmlize(COUNCIL_PAGE)

        # (marker to look for, label text to strip, contact detail type)
        phone_kinds = (
            ('Tel', 'Tel. ', 'voice'),
            ('Fax', 'Fax ', 'fax'),
        )

        entries = document.xpath('//div[@class="entry-content"]//p/strong')
        for entry in entries:
            headings = entry.xpath('./ancestor::p/preceding-sibling::h2')
            district = headings[-1].text_content().split(
                '–'.decode('utf-8'))[0]
            words = entry.text_content().split()
            name = ' '.join(words[-2:]).replace('-Â'.decode('utf-8'), '')
            role = entry.text_content().replace(name, '').split('-')[0]
            if not role or 'SAO' in role:
                continue

            organization = Organization(
                name=district + ' Municipal Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            organization.add_source(COUNCIL_PAGE)
            yield organization

            legislator = Legislator(name=name, post_id=district)
            legislator.add_source(COUNCIL_PAGE)
            membership = legislator.add_membership(
                organization, role=role, post_id=district)

            for text in entry.xpath('./ancestor::p/text()'):
                if 'NT' in text:
                    membership.add_contact_detail('address', text.strip(),
                                                  'legislature')
                # Each kind is tested against the text as left by the
                # previous kind, so a line may contribute both numbers.
                for marker, label, kind in phone_kinds:
                    if marker in text:
                        text = text.replace(label, '')
                        text = text.replace('(', '').replace(') ', '-')
                        text = text.strip()
                        membership.add_contact_detail(kind, text,
                                                      'legislature')

            addresses = entry.xpath(
                './parent::p//a[contains(@href, "mailto:")]/text()')
            membership.add_contact_detail('email', addresses[0], None)

            parent = entry.xpath('./parent::p')[0]
            if 'Website' in parent.text_content():
                # The paragraph's second link is used as the website link.
                website = entry.xpath('./parent::p//a')[1]
                legislator.add_link(website.attrib['href'], None)
            yield legislator
コード例 #10
0
  def get_people(self):
    """Yield a council Organization and its Legislators for each district link.

    Every link in the "left-content" / "right-content" divs of COUNCIL_PAGE is
    followed. The district page's definition lists supply the shared contact
    details, and the "Elected Officials" list supplies one name per line,
    optionally tagged "(Mayor)", "(Deputy Mayor)" or "(Chairperson)"; untagged
    names get the role 'Councillor'.
    """
    page = lxmlize(COUNCIL_PAGE)

    districts = page.xpath('//div[@id="left-content" or @id="right-content"]//a')
    for district in districts:
      url = district.attrib['href']
      district_name = district.text_content()
      page = lxmlize(url)

      org = Organization(name=district_name + ' Council', classification='legislature', jurisdiction_id=self.jurisdiction.jurisdiction_id)
      org.add_source(url)
      yield org

      # Reset for every district: a page lacking one of these fields must not
      # raise NameError or inherit the previous district's value.
      phone = fax = address = email = site = None

      info = page.xpath('//div[@style="WIDTH:750"]/dl')
      for entry in info:
        contact_type = entry.xpath('./dt')[0].text_content()
        value = entry.xpath('./dd')[0].text_content().replace('(', '').replace(') ', '-')
        # Contact fields are only read up to the officials list.
        if 'Officials' in contact_type:
          break
        if 'Tel' in contact_type:
          phone = value
        # NOTE(review): 'Fac' presumably matches a "Facsimile" label - confirm.
        if 'Fac' in contact_type:
          fax = value
        if 'Address' in contact_type:
          address = value
        if 'Email' in contact_type:
          email = value
        if 'Website' in contact_type:
          site = value

      councillors = page.xpath('//div[@style="WIDTH:750"]/dl/dt[contains(text(), "Elected Officials")]/parent::dl/dd/pre/text()')[0].splitlines(True)
      for councillor in councillors:
        name = councillor.replace('(Mayor)', '').replace('(Deputy Mayor)', '').replace('(Chairperson)', '').strip()
        # Blank lines in the list would otherwise become nameless legislators.
        if not name:
          continue
        # Whatever is left once the name is removed is the parenthesised role.
        role = re.sub(r'\(|\)', '', councillor.replace(name, '').strip())
        if not role:
          role = 'Councillor'
        p = Legislator(name=name, post_id=district_name)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        membership = p.add_membership(org, role=role, post_id=district_name)
        if phone:
          membership.add_contact_detail('voice', clean_telephone_number(phone), 'legislature')
        if fax:
          membership.add_contact_detail('fax', clean_telephone_number(fax), 'legislature')
        if address:
          membership.add_contact_detail('address', clean_address(address), 'legislature')
        if email:
          membership.add_contact_detail('email', email, None)
        if site:
          p.add_link(site, None)
        yield p
コード例 #11
0
ファイル: test_organization.py プロジェクト: lfalvarez/pupa
def test_basic_invalid_organization():
    """Check that validation fails once the organization's name is cleared."""
    org = Organization("name")
    org.add_source(url='foo')
    # Baseline: a named organization with a source validates.
    org.validate()

    org.name = None

    with assert_raises(ValidationError):
        org.validate()
コード例 #12
0
ファイル: jurisdiction.py プロジェクト: lfalvarez/pupa
def import_jurisdiction(org_importer, jurisdiction):
    """Save a jurisdiction to the database and import its organizations.

    The jurisdiction's database object is stamped with its type, ID and the
    current UTC time, validated against ``jurisdiction_schema`` and saved to
    ``db.jurisdictions``. A top-level 'legislature' Organization is then
    imported, followed by one child 'legislature' Organization per chamber
    and one 'party' Organization per entry in ``jurisdiction.parties``.

    Args:
        org_importer: object whose ``import_object`` is called with each
            Organization; its return value for the top-level legislature is
            used as the ``parent_id`` of the chamber organizations.
        jurisdiction: object providing ``get_db_object()``,
            ``jurisdiction_id``, ``name``, ``other_names``, ``parent_id``,
            ``chambers`` (mapping of chamber to properties with a 'name'
            key) and ``parties`` (iterable of mappings with a 'name' key).

    Raises:
        ValueError: if the jurisdiction ID does not start with
            "ocd-jurisdiction/".
        Whatever ``DatetimeValidator.validate`` raises for an invalid object
        (the previous code anticipated ``ValueError``); it propagates
        unchanged and nothing is saved.
    """
    obj = jurisdiction.get_db_object()

    obj['_type'] = 'jurisdiction'
    obj['_id'] = jurisdiction.jurisdiction_id

    if not obj['_id'].startswith("ocd-jurisdiction/"):
        raise ValueError("The Jurisdiction appears to have an ID that does not"
                         " begin with 'ocd-jurisdiction'. I found '%s'" % (
                             jurisdiction.jurisdiction_id))

    obj['latest_update'] = datetime.datetime.utcnow()

    # Validate before saving. Errors are allowed to propagate directly: the
    # former ``except ValueError as ve: raise ve`` wrapper added nothing and
    # replaced the original traceback on Python 2.
    DatetimeValidator().validate(obj, jurisdiction_schema)

    db.jurisdictions.save(obj)

    # Create the top-level legislature organization.
    org = Organization(name=jurisdiction.name, classification='legislature',
                       jurisdiction_id=jurisdiction.jurisdiction_id)
    if jurisdiction.other_names:
        org.other_names = jurisdiction.other_names
    if jurisdiction.parent_id:
        org.parent_id = jurisdiction.parent_id

    parent_id = org_importer.import_object(org)

    # Each chamber becomes a child organization of the legislature.
    if jurisdiction.chambers:
        for chamber, properties in jurisdiction.chambers.items():
            chamber_org = Organization(
                name=properties['name'], classification='legislature',
                chamber=chamber, parent_id=parent_id,
                jurisdiction_id=jurisdiction.jurisdiction_id)
            org_importer.import_object(chamber_org)

    # Create one party organization per declared party.
    for party in jurisdiction.parties:
        party_org = Organization(classification='party',
                                 name=party['name'],
                                 parent_id=None)
        org_importer.import_object(party_org)
コード例 #13
0
ファイル: people.py プロジェクト: fchagnon/scrapers-ca
  def get_people(self):
    """Scrape COUNCIL_PAGE and yield each district council with its members.

    Every third row of the search-results table, starting with the sixth, is
    read as one district. The council Organization is yielded first; each
    Legislator that follows carries the district's shared contact details.
    """
    root = lxmlize(COUNCIL_PAGE)
    all_rows = root.xpath('//div[@id="ctl00_PublicContent_divSearchContent"]//tr')

    for row in all_rows[5::3]:
      texts = row.xpath('.//td//text()')
      # A one-character first node means the title is split over two nodes.
      if len(texts[0]) > 1:
        title = texts[0]
      else:
        title = ''.join(texts[:2])

      # @todo Need to distinguish between, e.g., R.M. and Town
      title = title.title()
      council = Organization(name=title + ' Municipal Council', classification='legislature', jurisdiction_id=self.jurisdiction.jurisdiction_id)
      council.add_source(COUNCIL_PAGE)
      yield council

      cell_text = row.xpath('.//td/text()')
      address = re.sub(r'(Fax:.*)', '', ' '.join(cell_text[:4])).strip()
      fax = [piece for piece in cell_text if 'Fax' in piece][0].split(':')[1].strip()

      phone_label = row.xpath('.//b[contains(text(), "Phone")]/text()')[0]
      phone = phone_label.split(':')[1].strip()
      email = row.xpath('.//a[contains(@href, "mailto:")]/text()')[0].strip()

      names = row.xpath('.//td[3]/text()')
      positions = row.xpath('.//td[2]/b/text()')
      # Contact details shared by every member, in the order they are added.
      shared_details = (
        ('address', address, 'legislature'),
        ('fax', fax, 'legislature'),
        ('voice', phone, 'legislature'),
        ('email', email, None),
      )
      for position, member_name in enumerate(names):
        person = Legislator(name=member_name, post_id=title)
        person.add_source(COUNCIL_PAGE)

        if position < 2:
          # @todo "Resident Administrator & Chief Administrative Officer" is split on two lines
          membership = person.add_membership(council, role=positions[position])
        else:
          membership = person.add_membership(council, role='Councillor')

        membership.post_id = title
        for kind, value, note in shared_details:
          membership.add_contact_detail(kind, value, note)
        yield person
コード例 #14
0
    def get_people(self):
        """Yield the Boise City Council, then one Person per council page.

        The first link found on the list page is skipped; every other link is
        loaded as a detail page that supplies the image, heading and e-mail.
        """
        urls = Urls(dict(list=legislators_url), self)

        city_council = Organization('Boise City Council')
        city_council.add_source(legislators_url)
        yield city_council

        detail_links = urls.list.xpath('//div[@id="content"]/div/a/@href')

        # Skip the mayor because his page has no name or email.
        for link in detail_links[1:]:

            urls.add(detail=link)

            photo = urls.detail.xpath('//div[@id="content"]/p/img/@src').pop()
            heading = urls.detail.xpath('//h1/text()').pop()

            # Remove "Council " and split the rest at the first space into
            # the role and the name.
            heading = heading.replace('Council ', '')
            title, _, full_name = heading.partition(' ')

            member = Person(full_name, image=photo)

            seat = member.add_membership(city_council, role=title)
            seat.add_source(urls.detail.url)

            # Drop the first seven characters (presumably "mailto:").
            mailto_href = urls.detail.xpath(
                '//a[contains(@href, "mailto")]/@href').pop()
            seat.contact_details.append({
                'type': 'email',
                'value': mailto_href[7:],
                'note': 'work',
            })

            member.add_source(urls.list.url)
            member.add_source(urls.detail.url)

            yield member
コード例 #15
0
    def get_people(self):
        """Yield a sample committee, its subcommittee and the committee chairman."""
        source_url = 'https://example.com'

        # Parent committee, with a chairman post.
        committee = Organization('Technology')
        committee.add_post('Chairman', 'chairman')
        committee.add_source(source_url)
        yield committee

        # Subcommittee attached to the committee above.
        subcommittee = Organization('Subcommittee on E-Commerce',
                                    parent=committee)
        subcommittee.add_source(source_url)
        yield subcommittee

        chairman = Person('Paul Tagliamonte', district='6', chamber='upper')
        chairman.add_membership(committee, role='chairman')
        chairman.add_source(source_url)
        yield chairman
コード例 #16
0
ファイル: import_billy.py プロジェクト: finestjava/pupa
    def migrate_legislatures(self, state):
        """Migrate legislature chambers and jurisdiction metadata from billy.

        For every matching billy metadata document, one 'legislature'
        Organization is created and saved per chamber, with a "Member" post
        for each billy district of that chamber. The metadata document is
        then re-keyed to the organization's ``jurisdiction_id``, stripped of
        the ``latest_*`` dump fields, given a ``division_id`` and saved to
        ``db.jurisdictions``.

        Args:
            state: value matched against the metadata ``_id``; a falsy value
                selects every metadata document.

        Raises:
            Exception: if no metadata document exists for the abbreviation
                of the last chamber organization created.
        """
        spec = {}
        if state:
            spec['_id'] = state

        for metad in self.billy_db.metadata.find(spec, timeout=False):
            abbr = metad['abbreviation']
            geoid = "ocd-division/country:us/state:%s" % (abbr)
            for chamber in metad['chambers']:
                cn = metad['chambers'][chamber]['name']
                cow = Organization("%s, %s" % (metad['legislature_name'], cn),
                                   classification="legislature",
                                   chamber=chamber,
                                   division_id=geoid,
                                   abbreviation=abbr)
                cow._openstates_id = "%s-%s" % (abbr, chamber)
                cow.add_source(metad['legislature_url'])

                for post in self.billy_db.districts.find({"abbr": abbr}):
                    # Districts of the other chamber(s) are not posts here.
                    if post['chamber'] != chamber:
                        continue

                    cow.add_post(label="Member", role="member", num_seats=post['num_seats'],
                                 id=post['name'])

                self.save_object(cow)

            # NOTE(review): ``cow`` is whatever the chamber loop left behind;
            # if ``metad['chambers']`` is empty it is unbound (first state) or
            # belongs to the previous state - confirm that cannot happen.
            meta = self.billy_db.metadata.find_one({"_id": cow.abbreviation})
            if meta is None:
                # Same exception type as before, now with a usable message.
                raise Exception(
                    "No billy metadata found for %r" % (cow.abbreviation,))
            meta.pop("_id")
            meta['_id'] = cow.jurisdiction_id

            # Drop the dump-location fields before saving.
            for badtag in ["latest_json_url", "latest_json_date",
                           "latest_csv_url", "latest_csv_date"]:
                meta.pop(badtag, None)

            meta['division_id'] = "ocd-division/country:us/state:%s" % (
                cow.abbreviation
            )

            db.jurisdictions.save(meta)
コード例 #17
0
    def get_people(self):
        """Walk the council listings and yield each council with its members.

        Starting from COUNCIL_PAGE, the first four links of the first
        "bluearrow" list are opened; every district page linked from those
        listings yields an Organization followed by one Legislator per
        non-vacant name. Members share the district's contact details.
        """
        root = lxmlize(COUNCIL_PAGE)
        type_links = root.xpath(
            '//div[@class="bluearrow shaded bottomborder "][1]/ul/li/a/@href'
        )[:4]
        for type_index, type_link in enumerate(type_links):
            listing = lxmlize(type_link)
            district_links = listing.xpath(
                '//div[@class="parbase list section cplist"]/table/tr/td[1]/b/a/@href'
            )
            for district_link in district_links:
                detail = lxmlize(district_link)
                heading = detail.xpath(
                    '//div[@class="pageHeader"]/h1/text()')[0]
                # The district name follows " - " in the page heading.
                district = heading.split(' - ')[1].strip()

                council = Organization(
                    name=district + org_types[type_index],
                    classification='legislature',
                    jurisdiction_id=self.jurisdiction.jurisdiction_id)
                council.add_source(district_link)
                yield council

                address = ', '.join(
                    detail.xpath('//div[@class="left_contents"]/p[1]/text()'))
                contact_lines = detail.xpath(
                    '//div[@class="left_contents"]/p[b[text() = "Contact"]]/text()'
                )
                phone = contact_lines[0].split(':')[1].strip().replace(
                    ' ', '-')
                fax = contact_lines[1].split(':')[1].strip().replace(' ', '-')

                email = None
                mail_links = detail.xpath(
                    '//div[@class="left_contents"]//a[contains(@href, "mailto:")]'
                )
                if mail_links:
                    email = mail_links[0].text_content()

                site = None
                web_links = detail.xpath(
                    '//div[@class="left_contents"]//a[not(contains(@href,"mailto:"))]'
                )
                if web_links:
                    site = web_links[0].text_content()

                # The address is always attached; the others only when set.
                shared_details = [('address', address, 'legislature')]
                optional_details = (
                    ('voice', phone, 'legislature'),
                    ('fax', fax, 'legislature'),
                    ('email', email, None),
                )
                for detail_entry in optional_details:
                    if detail_entry[1]:
                        shared_details.append(detail_entry)

                names = detail.xpath(
                    '//div[@class="right_contents"]//p/text()')
                for position, member_name in enumerate(names):
                    if 'Vacant' in member_name:
                        continue
                    person = Legislator(name=member_name, post_id=district)
                    person.add_source(COUNCIL_PAGE)
                    person.add_source(type_link)
                    person.add_source(district_link)

                    # Only the first listed name is given the mayor role.
                    if position == 0:
                        role = 'Mayor'
                    else:
                        role = 'Councillor'
                    membership = person.add_membership(council, role=role)

                    membership.post_id = district
                    for kind, value, note in shared_details:
                        membership.add_contact_detail(kind, value, note)
                    if site:
                        person.add_link(site, None)
                    yield person
コード例 #18
0
ファイル: people.py プロジェクト: fchagnon/scrapers-ca
  def get_people(self):
    """Yield an Organization and its Legislators for each district in the PDF.

    The PDF at COUNCIL_PAGE is downloaded to a temporary file and converted
    with ``pdftotext -layout``. The text is cut into groups of consecutive
    non-blank lines, each group is split into a left and a right column, and
    every column is parsed as one district: its first line gives the name and
    the remaining "Label: value" lines give the contacts and the members.
    Districts without any named member are skipped.

    Raises:
        subprocess.CalledProcessError: if ``pdftotext`` exits with an error.
    """
    pdf_path = '/tmp/sk.pdf'
    response = urllib2.urlopen(COUNCIL_PAGE).read()
    # The download is raw PDF data: write it in binary mode and let the
    # context manager close the file even if the write fails.
    with open(pdf_path, 'wb') as pdf:
      pdf.write(response)

    try:
      data = subprocess.check_output(['pdftotext', '-layout', pdf_path, '-'])
    finally:
      # The file is only needed for the conversion, so remove it right away
      # instead of relying on the generator being run to completion.
      os.remove(pdf_path)

    # Group consecutive content lines; blank lines and header/footer lines
    # close the current group.
    pages = []
    page = []
    for line in data.splitlines(True):
      if line.strip() and 'Page' not in line and 'CITIES' not in line and 'NORTHERN TOWNS, VILLAGES' not in line:
        page.append(line)
      elif page:
        pages.append(page)
        page = []

    districts = []
    for page in pages:
      # The column boundary is the end of the first run of six or more
      # whitespace characters on the group's first line; without such a run
      # the split falls just before the last character.
      index = re.search(r'(\s{6,})', page[0])
      if index:
        index = index.end() - 1
      else:
        index = -1
      dist1 = []
      dist2 = []
      for line in page:
        dist1.append(line[:index].strip())
        dist2.append(line[index:].strip())
      districts.append(dist1)
      districts.append(dist2)

    for district in districts:

      district_name = district.pop(0).split(',')[0].title()

      org = Organization(name=district_name + ' Council', classification='legislature', jurisdiction_id=self.jurisdiction.jurisdiction_id)
      org.add_source(COUNCIL_PAGE)

      councillors = []
      contacts = {}
      for i, line in enumerate(district):
        if 'Phone' in line:
          phone = line.split(':')[1].replace('(', '').replace(') ', '-').strip()
          if phone:
            contacts['voice'] = phone
        if 'Fax' in line:
          fax = line.split(':')[1].replace('(', '').replace(') ', '-').strip()
          if fax:
            contacts['fax'] = fax
        if 'E-Mail' in line:
          email = line.split(':')[1].strip()
          if email:
            contacts['email'] = email
        if 'Address' in line and line.split(':')[1].strip():
          # The address continues on every line that follows its label.
          address = line.split(':')[1].strip() + ', ' + ', '.join(district[i + 1:]).replace(' ,', '')
          contacts['address'] = address
        if 'Mayor' in line or 'Councillor' in line or 'Alderman' in line:
          # Strip honorifics from the name; the label before ':' is the role.
          councillor = line.split(':')[1].replace('Mr.', '').replace('Mrs.', '').replace('Ms.', '').replace('His Worship', '').replace('Her Worship', '').strip()
          role = line.split(':')[0].strip()
          if councillor:
            councillors.append([councillor, role])

      if not councillors:
        continue
      yield org
      for councillor in councillors:
        p = Legislator(name=councillor[0], post_id=district_name)
        p.add_source(COUNCIL_PAGE)
        membership = p.add_membership(org, role=councillor[1], post_id=district_name)

        for key, value in contacts.iteritems():
          membership.add_contact_detail(key, value, None if key == 'email' else 'legislature')
        yield p
コード例 #19
0
ファイル: people.py プロジェクト: fchagnon/scrapers-ca
    def get_people(self):
        """Yield a council Organization and its mayor for each directory row.

        Follows the "Municipal Directory" link found on COUNCIL_PAGE, converts
        that PDF to text with ``pdftotext -layout`` and slices every row using
        the column offsets of the page's "Official Name" header line.

        Yields:
            For each row that has both a district and a name: the
            Organization first, then the Legislator given the 'Mayor' role.
        """
        page = lxmlize(COUNCIL_PAGE)
        url = page.xpath(
            '//a[contains(text(),"Municipal Directory")]/@href')[0]

        response = urllib2.urlopen(url).read()
        pdf_path = '/tmp/nl.pdf'
        try:
            # The download is a PDF: write it in binary mode so the bytes are
            # never newline-translated.
            with open(pdf_path, 'wb') as pdf:
                pdf.write(response)
            data = subprocess.check_output(
                ['pdftotext', '-layout', pdf_path, '-'])
        finally:
            # The text is in memory now. Removing the scratch file here means
            # it is cleaned up even when parsing raises or the caller stops
            # consuming the generator early.
            if os.path.exists(pdf_path):
                os.remove(pdf_path)

        def cell(line, columns, field):
            """Return the stripped text of the `field` column in `line`."""
            start = columns[field + '_start']
            end = columns[field + '_end']
            return line[start:end].strip()

        for page_text in data.split('Municipal Directory')[1:]:
            rows = page_text.splitlines(True)

            # Column boundaries are taken from the header line of each page.
            column_index = {}
            for line in rows:
                if 'Official Name' not in line:
                    continue
                fax_end = re.search('E-mail', line).start() - 2
                email_end = re.search('Address', line).start() - 1
                column_index = {
                    'dist_end': re.search('Region', line).start(),
                    'name_start': re.search('Mayor', line).start() + 1,
                    'name_end': re.search('Clerk', line).start() - 1,
                    'phone_start': re.search('Line 1', line).start(),
                    'phone_end': re.search('Line 2', line).start() - 1,
                    'fax_start': re.search('Fax', line).start(),
                    'fax_end': fax_end,
                    # The e-mail column is taken to begin right after the
                    # fax column, and the address right after the e-mail.
                    'email_start': fax_end + 1,
                    'email_end': email_end,
                    'address_start': email_end + 1,
                    'address_end': re.search('Days', line).start() - 1,
                }
                break

            for line in rows:
                if 'Official Name' in line or not line.strip():
                    continue
                # Strip the district cell so that a blank cell is really
                # detected below and no column padding leaks into the
                # organization name or the post id.
                district = line[:column_index['dist_end']].strip()
                name = cell(line, column_index, 'name')
                if not name or not district:
                    continue

                phone = cell(line, column_index, 'phone').replace(
                    '(', '').replace(') ', '-')
                email = cell(line, column_index, 'email')
                address = re.sub(r'\s{2,}', ', ',
                                 cell(line, column_index, 'address'))

                org = Organization(
                    name=district + ' Municipal Council',
                    classification='legislature',
                    jurisdiction_id=self.jurisdiction.jurisdiction_id)
                org.add_source(COUNCIL_PAGE)
                org.add_source(url)
                yield org

                p = Legislator(name=name, post_id=district)
                p.add_source(COUNCIL_PAGE)
                p.add_source(url)
                membership = p.add_membership(org,
                                              role='Mayor',
                                              post_id=district)
                if phone:
                    membership.add_contact_detail('voice', phone,
                                                  'legislature')
                # The fax number is deliberately not recorded: that column
                # is not properly aligned in the converted text.
                if email:
                    membership.add_contact_detail('email', email, None)
                if address:
                    membership.add_contact_detail('address', address,
                                                  'legislature')
                yield p
コード例 #20
0
ファイル: people.py プロジェクト: fchagnon/scrapers-ca
  def get_people(self):
    """Yield each municipal council and its members parsed from the PDF.

    Downloads COUNCIL_PAGE, converts it to text with ``pdftotext -layout``
    and treats every blank-line-separated chunk that mentions 'Councillors'
    as one municipality.

    Yields:
      The Organization for a municipality, followed by one Legislator per
      non-empty name found after its 'Mayor:' and 'Councillors' labels.
    """
    response = urllib2.urlopen(COUNCIL_PAGE).read()
    pdf_path = '/tmp/yt.pdf'
    try:
      # The download is a PDF: write it in binary mode so the bytes are never
      # newline-translated.
      with open(pdf_path, 'wb') as pdf:
        pdf.write(response)
      data = subprocess.check_output(['pdftotext', '-layout', pdf_path, '-'])
    finally:
      # The text is in memory now. Removing the scratch file here means it is
      # cleaned up even when parsing raises or the caller stops consuming the
      # generator early.
      if os.path.exists(pdf_path):
        os.remove(pdf_path)

    for municipality in re.split(r'\n\s*\n', data):
      if 'Councillors' not in municipality:
        continue

      lines = municipality.split('\n')
      # Drop a leading page marker and the empty line that may follow it.
      if 'Page' in lines[0]:
        lines.pop(0)
        if not lines[0].strip():
          lines.pop(0)

      # Column boundaries are measured on the stripped first line.
      header = lines[0].strip()
      col1end = re.search(r'\s{2,}(\w)', header).end()
      col2end = re.search(r':\s{2,}(\w)', header).end()

      left = [row[:col1end - 1].strip() for row in lines[:4]]
      if 'Council' in lines[1]:
        # Presumably the district name wraps onto the second line here.
        district = left[0] + ' ' + left[1]
        address = left[2] + ' ' + left[3]
      else:
        district = left[0]
        address = left[1] + ' ' + left[2]

      organization = Organization(
        name=district + ' Council',
        classification='legislature',
        jurisdiction_id=self.jurisdiction.jurisdiction_id)
      organization.add_source(COUNCIL_PAGE)
      yield organization

      phone = re.findall(r'(?<=Phone: )\(?(\d{3}[\)-] ?\d{3}-\d{4})',
                         municipality)[0].replace(') ', '-')
      email = re.findall(r'(?<=E-mail:) (\S*)', municipality)[0]
      fax = None
      if 'Fax' in municipality:
        fax = re.findall(r'(?<=Fax: )\(?(\d{3}[\)-] ?\d{3}-\d{4})',
                         municipality)[0].replace(') ', '-')
      website = None
      if 'Website' in municipality:
        # The dot after "www" is escaped so it only matches a literal dot,
        # and https URLs are captured together with their scheme.
        website = re.findall(r'((https?://|www\.)(\S*))', municipality)[0][0]

      # `role` stays None until the first 'Mayor:' / 'Councillors' label;
      # the lines after a label carry the names for that role.
      role = None
      for line in lines:
        if 'Mayor:' in line:
          role = 'Mayor'
          continue
        if 'Councillors' in line:
          role = 'Councillor'
          continue
        if role is None:
          continue

        councillor = line[col1end - 1:col2end - 1].strip()
        if not councillor:
          continue

        p = Legislator(name=councillor, post_id=district)
        p.add_source(COUNCIL_PAGE)
        membership = p.add_membership(organization, role=role, post_id=district)
        membership.add_contact_detail('address', address, 'legislature')
        membership.add_contact_detail('voice', phone, 'legislature')
        membership.add_contact_detail('email', email, None)
        if fax:
          membership.add_contact_detail('fax', fax, 'legislature')
        if website:
          p.add_link(website, None)
        yield p
コード例 #21
0
ファイル: people.py プロジェクト: fchagnon/scrapers-ca
    def get_people(self):
        """Yield each council and its members parsed from the directory PDF.

        Downloads COUNCIL_PAGE, converts it to text with
        ``pdftotext -layout``, groups the text into pages and splits every
        page into a left and a right column. Each column is handled as one
        district whose first line carries the district name.

        Yields:
            For every district with at least one named member: the
            Organization, then one Legislator per member. Every member of a
            district receives the same contact details.
        """
        response = urllib2.urlopen(COUNCIL_PAGE).read()
        pdf_path = '/tmp/sk.pdf'
        try:
            # The download is a PDF: write it in binary mode so the bytes are
            # never newline-translated.
            with open(pdf_path, 'wb') as pdf:
                pdf.write(response)
            data = subprocess.check_output(
                ['pdftotext', '-layout', pdf_path, '-'])
        finally:
            # The text is in memory now. Removing the scratch file here means
            # it is cleaned up even when parsing raises or the caller stops
            # consuming the generator early.
            if os.path.exists(pdf_path):
                os.remove(pdf_path)

        # Blank lines and lines containing one of these markers separate
        # pages; everything else is page content.
        separators = ('Page', 'CITIES', 'NORTHERN TOWNS, VILLAGES')
        pages = []
        page = []
        for line in data.splitlines(True):
            is_content = line.strip() and not any(
                marker in line for marker in separators)
            if is_content:
                page.append(line)
            elif page:
                pages.append(page)
                page = []
        if page:
            # Keep the last page when the text does not end on a separator.
            pages.append(page)

        districts = []
        for page in pages:
            # The columns are split where the first run of six or more
            # whitespace characters on the page's first line ends.
            gap = re.search(r'(\s{6,})', page[0])
            index = gap.end() - 1 if gap else -1
            districts.append([line[:index].strip() for line in page])
            districts.append([line[index:].strip() for line in page])

        honorifics = ('Mr.', 'Mrs.', 'Ms.', 'His Worship', 'Her Worship')
        for district in districts:
            district_name = district.pop(0).split(',')[0].title()

            org = Organization(
                name=district_name + ' Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(COUNCIL_PAGE)

            councillors = []
            contacts = {}
            for i, line in enumerate(district):
                if 'Phone' in line:
                    phone = line.split(':')[1].replace('(', '').replace(
                        ') ', '-').strip()
                    if phone:
                        contacts['voice'] = phone
                if 'Fax' in line:
                    fax = line.split(':')[1].replace('(', '').replace(
                        ') ', '-').strip()
                    if fax:
                        contacts['fax'] = fax
                if 'E-Mail' in line:
                    email = line.split(':')[1].strip()
                    if email:
                        contacts['email'] = email
                if 'Address' in line and line.split(':')[1].strip():
                    # The address runs from this line to the end of the
                    # column; empty lines are collapsed by the replace().
                    address = line.split(':')[1].strip() + ', ' + ', '.join(
                        district[i + 1:]).replace(' ,', '')
                    contacts['address'] = address
                if ('Mayor' in line or 'Councillor' in line
                        or 'Alderman' in line):
                    councillor = line.split(':')[1]
                    for honorific in honorifics:
                        councillor = councillor.replace(honorific, '')
                    councillor = councillor.strip()
                    role = line.split(':')[0].strip()
                    if councillor:
                        councillors.append([councillor, role])

            if not councillors:
                continue
            yield org
            for name, role in councillors:
                p = Legislator(name=name, post_id=district_name)
                p.add_source(COUNCIL_PAGE)
                membership = p.add_membership(org,
                                              role=role,
                                              post_id=district_name)

                # items() instead of the Python-2-only iteritems().
                for key, value in contacts.items():
                    membership.add_contact_detail(
                        key, value, None if key == 'email' else 'legislature')
                yield p
コード例 #22
0
    def get_people(self):
        """Yield the Denver City Council and one Person per council member.

        Images, names, detail links and posts are read from the legislator
        list page (`legislators_url`) and combined positionally; the phone
        number and e-mail address come from each member's detail page.

        Yields:
            The council Organization (with one post per district found),
            then a Person for every member.
        """
        urls = Urls(dict(list=legislators_url), self)

        council = Organization('Denver City Council')
        council.add_source(legislators_url)

        # Get image urls, names, detail urls, and districts.
        image_xpath = '//a[contains(@href, "councildistrict")]/img/@src'
        image_urls = urls.list.xpath(image_xpath)

        name_xpath = '//a[contains(@href, "councildistrict")]'
        # The last matching link is dropped, then links without text.
        link_texts = [a.text_content() for a in urls.list.xpath(name_xpath)]
        names = [text for text in link_texts[:-1] if text]

        person_urls_xpath = '//a[contains(@href, "councildistrict")]/@href'
        person_urls = urls.list.xpath(person_urls_xpath)

        post_ids = []
        xpath = '//a[contains(@href, "councildistrict")]/img/ancestor::td'
        for td in urls.list.xpath(xpath):
            text = td.text_content()
            m = re.search(r'Council District \d+', text)
            if m:
                post_ids.append(m.group())
                continue
            if re.search(r'Council At-Large', text):
                post_ids.append('Council At-Large')

        for post_id in post_ids:
            council.add_post(post_id, post_id)
        yield council

        phone_regex = r'\(\d{3}\)[ -]*\d{3}-\d{4}'
        # The previous pattern only matched the literal text
        # "[email protected]" (presumably an e-mail obfuscation placeholder
        # that replaced the real pattern), so a general address pattern is
        # used instead.
        email_regex = r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+'

        data = zip(image_urls, names, person_urls, post_ids)
        for image_url, name, person_url, post_id in data:

            # Create legislator.
            person = Person(name, image=image_url)

            # Add sources.
            urls.add(detail=person_url)
            person.add_source(urls.list.url, note='list')
            person.add_source(urls.detail.url, note='detail')

            # Add membership on council.
            memb = person.add_membership(council, post_id=post_id.strip())
            memb.add_source(urls.detail.url)

            xpath = '//div[@id="dnn_column3"]'
            contact_text = urls.detail.xpath(xpath)[0].text_content()

            # Fall back to the wide right pane when the column is empty.
            if not contact_text.strip():
                xpath = '//div[contains(@id, "dnn_RightPaneWide")]'
                contact_text = urls.detail.xpath(xpath)[0].text_content()

            # A detail page without a phone number or address no longer
            # aborts the scrape; the missing contact detail is skipped.
            phone = re.search(phone_regex, contact_text)
            if phone:
                memb.contact_details.append(
                    dict(type='phone', value=phone.group(), note='work'))

            # Add email address.
            email = re.search(email_regex, contact_text)
            if email:
                memb.contact_details.append(
                    dict(type='email', value=email.group(), note='work'))

            yield person
コード例 #23
0
    def get_people(self):
        """Yield the Denver City Council and one Person per council member.

        Images, names, detail links and posts are read from the legislator
        list page (`legislators_url`) and combined positionally; the phone
        number and e-mail address come from each member's detail page.

        Yields:
            The council Organization (with one post per district found),
            then a Person for every member.
        """
        urls = Urls(dict(list=legislators_url), self)

        council = Organization('Denver City Council')
        council.add_source(legislators_url)

        # Get image urls, names, detail urls, and districts.
        image_xpath = '//a[contains(@href, "councildistrict")]/img/@src'
        image_urls = urls.list.xpath(image_xpath)

        name_xpath = '//a[contains(@href, "councildistrict")]'
        # The last matching link is dropped, then links without text.
        link_texts = [a.text_content() for a in urls.list.xpath(name_xpath)]
        names = [text for text in link_texts[:-1] if text]

        person_urls_xpath = '//a[contains(@href, "councildistrict")]/@href'
        person_urls = urls.list.xpath(person_urls_xpath)

        post_ids = []
        xpath = '//a[contains(@href, "councildistrict")]/img/ancestor::td'
        for td in urls.list.xpath(xpath):
            text = td.text_content()
            m = re.search(r'Council District \d+', text)
            if m:
                post_ids.append(m.group())
                continue
            if re.search(r'Council At-Large', text):
                post_ids.append('Council At-Large')

        for post_id in post_ids:
            council.add_post(post_id, post_id)
        yield council

        phone_regex = r'\(\d{3}\)[ -]*\d{3}-\d{4}'
        # The previous pattern only matched the literal text
        # "[email protected]" (presumably an e-mail obfuscation placeholder
        # that replaced the real pattern), so a general address pattern is
        # used instead.
        email_regex = r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+'

        data = zip(image_urls, names, person_urls, post_ids)
        for image_url, name, person_url, post_id in data:

            # Create legislator.
            person = Person(name, image=image_url)

            # Add sources.
            urls.add(detail=person_url)
            person.add_source(urls.list.url, note='list')
            person.add_source(urls.detail.url, note='detail')

            # Add membership on council.
            memb = person.add_membership(council, post_id=post_id.strip())
            memb.add_source(urls.detail.url)

            xpath = '//div[@id="dnn_column3"]'
            contact_text = urls.detail.xpath(xpath)[0].text_content()

            # Fall back to the wide right pane when the column is empty.
            if not contact_text.strip():
                xpath = '//div[contains(@id, "dnn_RightPaneWide")]'
                contact_text = urls.detail.xpath(xpath)[0].text_content()

            # A detail page without a phone number or address no longer
            # aborts the scrape; the missing contact detail is skipped.
            phone = re.search(phone_regex, contact_text)
            if phone:
                memb.contact_details.append(
                    dict(type='phone', value=phone.group(), note='work'))

            # Add email address.
            email = re.search(email_regex, contact_text)
            if email:
                memb.contact_details.append(
                    dict(type='email', value=email.group(), note='work'))

            yield person
コード例 #24
0
ファイル: test_organization.py プロジェクト: lfalvarez/pupa
def test_add_post():
    """Check that posts and identifiers can be added to an Organization."""
    org = Organization("name")
    org.add_source(url='foo')
    org.validate()

    org.add_post("Human Readable Name", "Chef")

    post = org.posts[0]
    assert post['role'] == "Chef"
    assert post['label'] == "Human Readable Name"

    # add_identifier() must reject keyword arguments it does not know.
    with assert_raises(TypeError):
        org.add_identifier("id10t", foo="bar")

    org.add_identifier("id10t")
    org.add_identifier("l0l", scheme="kruft")

    first, last = org.identifiers[0], org.identifiers[-1]
    assert last['scheme'] == "kruft"
    assert first['identifier'] == "id10t"
    # NOTE(review): the identifiers are indexed like mappings above, and
    # hasattr() looks at attributes, not keys - so this check probably
    # cannot fail. Confirm whether `"scheme" not in first` was intended.
    assert not hasattr(first, "scheme")
0
ファイル: import_billy.py プロジェクト: finestjava/pupa
    def migrate_committees(self, state):
        """Save billy committees and their members as Organizations.

        Top-level committees (``subcommittee`` is None) are saved first so
        that subcommittees can look up their parent afterwards.

        Args:
            state: when truthy, only committees of this state are migrated.
        """

        def attach_members(committee, org):
            """Save a Membership in `org` for every member with a known id."""
            term = get_current_term(obj_to_jid(org))

            def save_membership(person_id, role):
                # No end date is recorded: the member currently sits on the
                # committee, and a departure would not be known yet.
                membership = Membership(person_id, org._id,
                                        role=role,
                                        chamber=org.chamber,
                                        start_date=str(term['start_year']))
                membership.add_extra('term', term['name'])
                self.save_object(membership)
                return membership

            for member in committee['members']:
                person_id = lookup_entry_id('people', member.get('leg_id'))
                if not person_id:
                    continue

                saved = save_membership(person_id, member['role'])
                if saved.role != 'member':
                    # A chair or vice-chair is additionally recorded as a
                    # plain member.
                    save_membership(person_id, 'member')

        spec = {"subcommittee": None}
        if state:
            spec['state'] = state

        # Root committees go first, so their ids exist for the second pass.
        for root in self.billy_db.committees.find(spec, timeout=False):
            billy_id = root['_id']
            org = Organization(root['committee'],
                               classification="committee")
            org.chamber = root['chamber']
            org.parent_id = lookup_entry_id('organizations', root['state'])
            org.identifiers = [
                {'scheme': 'openstates', 'identifier': billy_id},
            ]
            org._openstates_id = billy_id
            org.sources = root['sources']
            org.created_at = root['created_at']
            org.updated_at = root['updated_at']
            # Posts are not migrated; they cannot be determined reliably.
            self.save_object(org)
            attach_members(root, org)

        spec["subcommittee"] = {"$ne": None}

        for sub in self.billy_db.committees.find(spec, timeout=False):
            billy_id = sub['_id']
            org = Organization(sub['subcommittee'],
                               classification="committee")

            # Prefer the parent committee; fall back to the state's
            # organization when the parent has no entry.
            parent_id = lookup_entry_id('organizations', sub['parent_id'])
            if not parent_id:
                parent_id = lookup_entry_id('organizations', sub['state'])
            org.parent_id = parent_id

            org.identifiers = [
                {'scheme': 'openstates', 'identifier': billy_id},
            ]
            org._openstates_id = billy_id
            org.sources = sub['sources']
            org.chamber = sub['chamber']
            # Posts are not migrated; they cannot be determined reliably.
            self.save_object(org)
            attach_members(sub, org)
コード例 #26
0
ファイル: people.py プロジェクト: fchagnon/scrapers-ca
  def get_people(self):
    """Yield a council Organization and its mayor for each directory row.

    Follows the "Municipal Directory" link found on COUNCIL_PAGE, converts
    that PDF to text with ``pdftotext -layout`` and slices every row using
    the column offsets of the page's "Official Name" header line.

    Yields:
      For each row that has both a district and a name: the Organization
      first, then the Legislator given the 'Mayor' role.
    """
    page = lxmlize(COUNCIL_PAGE)
    url = page.xpath('//a[contains(text(),"Municipal Directory")]/@href')[0]

    response = urllib2.urlopen(url).read()
    pdf_path = '/tmp/nl.pdf'
    try:
      # The download is a PDF: write it in binary mode so the bytes are never
      # newline-translated.
      with open(pdf_path, 'wb') as pdf:
        pdf.write(response)
      data = subprocess.check_output(['pdftotext', '-layout', pdf_path, '-'])
    finally:
      # The text is in memory now. Removing the scratch file here means it is
      # cleaned up even when parsing raises or the caller stops consuming the
      # generator early.
      if os.path.exists(pdf_path):
        os.remove(pdf_path)

    def cell(line, columns, field):
      """Return the stripped text of the `field` column in `line`."""
      return line[columns[field + '_start']:columns[field + '_end']].strip()

    for page_text in data.split('Municipal Directory')[1:]:
      rows = page_text.splitlines(True)

      # Column boundaries are taken from the header line of each page.
      column_index = {}
      for line in rows:
        if 'Official Name' not in line:
          continue
        fax_end = re.search('E-mail', line).start() - 2
        email_end = re.search('Address', line).start() - 1
        column_index = {
          'dist_end': re.search('Region', line).start(),
          'name_start': re.search('Mayor', line).start() + 1,
          'name_end': re.search('Clerk', line).start() - 1,
          'phone_start': re.search('Line 1', line).start(),
          'phone_end': re.search('Line 2', line).start() - 1,
          'fax_start': re.search('Fax', line).start(),
          'fax_end': fax_end,
          # The e-mail column is taken to begin right after the fax column,
          # and the address right after the e-mail.
          'email_start': fax_end + 1,
          'email_end': email_end,
          'address_start': email_end + 1,
          'address_end': re.search('Days', line).start() - 1,
        }
        break

      for line in rows:
        if 'Official Name' in line or not line.strip():
          continue
        # Strip the district cell so that a blank cell is really detected
        # below and no column padding leaks into the organization name or
        # the post id.
        district = line[:column_index['dist_end']].strip()
        name = cell(line, column_index, 'name')
        if not name or not district:
          continue

        phone = cell(line, column_index, 'phone').replace('(', '').replace(') ', '-')
        email = cell(line, column_index, 'email')
        address = re.sub(r'\s{2,}', ', ', cell(line, column_index, 'address'))

        org = Organization(
          name=district + ' Municipal Council',
          classification='legislature',
          jurisdiction_id=self.jurisdiction.jurisdiction_id)
        org.add_source(COUNCIL_PAGE)
        org.add_source(url)
        yield org

        p = Legislator(name=name, post_id=district)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        membership = p.add_membership(org, role='Mayor', post_id=district)
        if phone:
          membership.add_contact_detail('voice', phone, 'legislature')
        # The fax number is deliberately not recorded: that column is not
        # properly aligned in the converted text.
        if email:
          membership.add_contact_detail('email', email, None)
        if address:
          membership.add_contact_detail('address', address, 'legislature')
        yield p
コード例 #27
0
    def get_people(self):
        """Yield each municipal council and its members parsed from the PDF.

        Downloads COUNCIL_PAGE, converts it to text with
        ``pdftotext -layout`` and treats every blank-line-separated chunk
        that mentions 'Councillors' as one municipality.

        Yields:
            The Organization for a municipality, followed by one Legislator
            per non-empty name found after its 'Mayor:' and 'Councillors'
            labels.
        """
        response = urllib2.urlopen(COUNCIL_PAGE).read()
        pdf_path = '/tmp/yt.pdf'
        try:
            # The download is a PDF: write it in binary mode so the bytes are
            # never newline-translated.
            with open(pdf_path, 'wb') as pdf:
                pdf.write(response)
            data = subprocess.check_output(
                ['pdftotext', '-layout', pdf_path, '-'])
        finally:
            # The text is in memory now. Removing the scratch file here means
            # it is cleaned up even when parsing raises or the caller stops
            # consuming the generator early.
            if os.path.exists(pdf_path):
                os.remove(pdf_path)

        for municipality in re.split(r'\n\s*\n', data):
            if 'Councillors' not in municipality:
                continue

            lines = municipality.split('\n')
            # Drop a leading page marker and the empty line that may follow.
            if 'Page' in lines[0]:
                lines.pop(0)
                if not lines[0].strip():
                    lines.pop(0)

            # Column boundaries are measured on the stripped first line.
            header = lines[0].strip()
            col1end = re.search(r'\s{2,}(\w)', header).end()
            col2end = re.search(r':\s{2,}(\w)', header).end()

            left = [row[:col1end - 1].strip() for row in lines[:4]]
            if 'Council' in lines[1]:
                # Presumably the district name wraps onto the second line.
                district = left[0] + ' ' + left[1]
                address = left[2] + ' ' + left[3]
            else:
                district = left[0]
                address = left[1] + ' ' + left[2]

            organization = Organization(
                name=district + ' Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            organization.add_source(COUNCIL_PAGE)
            yield organization

            phone = re.findall(r'(?<=Phone: )\(?(\d{3}[\)-] ?\d{3}-\d{4})',
                               municipality)[0].replace(') ', '-')
            email = re.findall(r'(?<=E-mail:) (\S*)', municipality)[0]
            fax = None
            if 'Fax' in municipality:
                fax = re.findall(r'(?<=Fax: )\(?(\d{3}[\)-] ?\d{3}-\d{4})',
                                 municipality)[0].replace(') ', '-')
            website = None
            if 'Website' in municipality:
                # The dot after "www" is escaped so it only matches a literal
                # dot, and https URLs are captured with their scheme.
                website = re.findall(r'((https?://|www\.)(\S*))',
                                     municipality)[0][0]

            # `role` stays None until the first 'Mayor:' / 'Councillors'
            # label; the lines after a label carry the names for that role.
            role = None
            for line in lines:
                if 'Mayor:' in line:
                    role = 'Mayor'
                    continue
                if 'Councillors' in line:
                    role = 'Councillor'
                    continue
                if role is None:
                    continue

                councillor = line[col1end - 1:col2end - 1].strip()
                if not councillor:
                    continue

                p = Legislator(name=councillor, post_id=district)
                p.add_source(COUNCIL_PAGE)
                membership = p.add_membership(organization,
                                              role=role,
                                              post_id=district)
                membership.add_contact_detail('address', address,
                                              'legislature')
                membership.add_contact_detail('voice', phone,
                                              'legislature')
                membership.add_contact_detail('email', email, None)
                if fax:
                    membership.add_contact_detail('fax', fax,
                                                  'legislature')
                if website:
                    p.add_link(website, None)
                yield p