def _scrape_people(self):
    """Yield a Legislator for each councilor on the cabq.gov council listing.

    The children of the listing's content div (after the first three) are
    read three at a time: a name element, an info element and a third
    element that is ignored. The page linked from the info element is
    fetched for the photo and the biography paragraphs.
    """
    url = 'http://www.cabq.gov/council/councilors'
    listing = self.lxmlize(url)
    children = iter(
        listing.xpath("//div[@id='parent-fieldname-text']/*")[3:])

    # Passing the same iterator three times makes zip hand out consecutive,
    # non-overlapping triples.
    for heading, details, _ in zip(children, children, children):
        # NOTE(review): image_small, email, policy_analyst and phone are
        # computed but never stored on the Legislator. The lookups are kept
        # because they raise when an entry does not have the expected shape.
        image_small = heading.xpath(".//img")[0].attrib['src']
        full_name = heading.text_content()
        profile_link, email, policy_analyst = details.xpath(".//a")
        phone = details.xpath(".//b")[-1].tail.strip()

        district = profile_link.text_content()
        profile = self.lxmlize(profile_link.attrib['href'])
        portraits = profile.xpath("//div[@class='featureContent']//img")
        photo = portraits[0].attrib['src']

        paragraphs = profile.xpath(
            "//div[@class='featureContent']//div[@class='stx']/p")
        bio = "\n".join(par.text_content() for par in paragraphs)

        councilor = Legislator(name=full_name,
                               post_id=district,
                               image=photo,
                               biography=bio)
        councilor.add_source(url)
        councilor.add_source(profile_link.attrib['href'])
        yield councilor
Beispiel #2
0
def test_legislator_related_party():
    """A Legislator created with a party gets a 'member' party membership."""
    adams = Legislator('John Adams', district='1',
                       party='Democratic-Republican')
    adams.pre_save('jurisdiction-id')

    # Two related objects are expected; the second one is checked here as
    # the party membership.
    assert len(adams._related) == 2
    membership = adams._related[1]
    assert membership.person_id == adams._id

    expected_org = {'classification': 'party',
                    'name': 'Democratic-Republican'}
    assert get_psuedo_id(membership.organization_id) == expected_org
    assert membership.role == 'member'
Beispiel #3
0
def test_legislator_related_district():
    """A Legislator created with only a district gets one 'member' membership."""
    adams = Legislator('John Adams', district='1')
    adams.pre_save('jurisdiction-id')

    # Exactly one related object is expected: the membership tied to the
    # legislature pseudo-organization and to the post labelled "1".
    assert len(adams._related) == 1
    membership = adams._related[0]
    assert membership.person_id == adams._id

    expected_org = {'chamber': '', 'classification': 'legislature'}
    assert get_psuedo_id(membership.organization_id) == expected_org

    expected_post = {"label": "1"}
    assert get_psuedo_id(membership.post_id) == expected_post
    assert membership.role == 'member'
    def scrape_homepage(self, folk):
        """Build a Legislator from the member page that *folk* links to.

        *folk* must provide the page address as ``attrib["href"]``. The page
        needs at least one member image (IndexError otherwise) and exactly
        one ``h3`` directly under the common content div (ValueError
        otherwise). The biography is the div's direct ``p`` children joined
        by blank lines.
        """
        url = folk.attrib["href"]
        page = self.lxmlize(url)

        portraits = page.xpath("//img[contains(@src, 'uploadedImages/City_Council/Members/')]")
        image = portraits[0].attrib["src"]

        # Single-element unpacking doubles as a check that the heading is
        # unique.
        (heading,) = page.xpath("//div[@id='ctl00_ctl00_Body_body_cntCommon']/h3")

        paragraphs = page.xpath("//div[@id='ctl00_ctl00_Body_body_cntCommon']/p")
        bio = "\n\n".join(par.text_content() for par in paragraphs)

        member = Legislator(name=heading.text, district="member", biography=bio, image=image)
        member.add_source(url)
        return member
Beispiel #5
0
    def get_people(self):
        """Yield a Legislator for each title/name pair found at ``self.url``.

        Titles come from divs whose class contains "biotitle" and names from
        divs whose class contains "bioname"; the two lists are paired by
        position, so any surplus entries in the longer list are ignored.
        """
        markup = self.urlopen(self.url)
        doc = lxml.html.fromstring(markup)

        title_nodes = doc.xpath('//div[contains(@class, "biotitle")]')
        name_nodes = doc.xpath('//div[contains(@class, "bioname")]')

        for title_node, name_node in zip(title_nodes, name_nodes):
            person = Legislator(name=name_node.text_content().strip(),
                                post_id=title_node.text_content().strip())
            person.add_source(self.url)
            yield person
    def bos_scrape_people(self):
        """Yield a Legislator for every councillor cell on MEMBER_LIST.

        Each centred table cell supplies a thumbnail image, a link to the
        councillor's homepage and, after the first ``<br>``, the post text.
        ``self.scrape_homepage`` is used as a mapping here: its optional
        'image' entry overrides the thumbnail and its 'bio' entry becomes
        the biography.
        """
        listing = self.lxmlize(MEMBER_LIST)
        cells = listing.xpath(
            "//table[@width='100%']//td[@style='TEXT-ALIGN: center']")

        for cell in cells:
            portrait = self.get_one(cell, ".//img")
            link = self.get_one(
                cell, ".//a[contains(@href, 'councillors') and (text()!='')]")
            post = cell.xpath(".//br")[0].tail.strip()

            # Listing thumbnail: used only when the homepage yields no image.
            thumbnail = portrait.attrib['src']
            homepage = link.attrib['href']
            full_name = clean_name(link.text)

            info = self.scrape_homepage(homepage)
            image = info['image'] if info.get('image', None) else thumbnail

            councillor = Legislator(name=full_name,
                                    post_id=post,
                                    image=image,
                                    biography=info['bio'])
            councillor.add_link(homepage, 'homepage')
            councillor.add_source(homepage)
            councillor.add_source(MEMBER_LIST)
            yield councillor
    def cleveland_scrape_people(self):
        """Yield a Legislator for each member cell of the Cleveland listing.

        The name is the text of the cell's first ``<strong>``; the text after
        the ``<br>`` inside it is matched against INFOSLUG, whose groups
        supply 'district' and 'gender'. When the cell has a link, the linked
        page is scraped for an optional 'bio' and a list of 'committees'.
        A committee entry ending in "- Chair" or "- Vice Chair" is recorded
        with that title as the role; every other entry is a plain "member".
        """
        listing = "http://www.clevelandcitycouncil.org/council-members/"
        index = self.lxmlize(listing)
        valid_titles = ["Chair", "Vice Chair"]

        table = index.xpath("//div[@class='standard-content column']//table")[0]
        for cell in table.xpath(".//td[@align='center']"):
            heading = cell.xpath(".//strong")[0]
            who = heading.text.strip()
            slug = heading.xpath("./br")[0].tail.strip()
            img = cell.xpath(".//img")[0].attrib['src']
            info = INFOSLUG.match(slug).groupdict()

            # The detail page is optional; without it there is no biography
            # and no committee list.
            links = cell.xpath(".//a")
            if links:
                scraped_info = self.scrape_page(links[0].attrib['href'])
            else:
                scraped_info = {}

            extra = {}
            biography = scraped_info.get('bio', None)
            if biography:
                extra['biography'] = biography

            member = Legislator(name=who,
                                district=info['district'],
                                gender=info['gender'],
                                image=img, **extra)
            member.add_source(listing)

            for entry in scraped_info.get('committees', []):
                committee = entry.strip()
                if not committee:
                    continue

                role = "member"
                if "-" in committee:
                    # Only the last hyphen can separate a title, since
                    # committee names may themselves contain hyphens.
                    base, title = (
                        part.strip() for part in committee.rsplit("-", 1))
                    if title in valid_titles:
                        committee, role = base, title
                member.add_committee_membership(committee, role=role)
            yield member
Beispiel #8
0
    def get_people(self):
        """Yield a sample committee, its subcommittee and one legislator.

        The objects are yielded in that order: the 'Technology' committee
        (with a 'Chairman' post), the e-commerce subcommittee whose parent
        is that committee, and a legislator who is a member of 'Finance' and
        chairman of 'Technology'.
        """
        # Parent committee.
        technology = Organization('Technology', classification='committee')
        technology.add_post('Chairman', 'chairman')
        yield technology

        # Subcommittee hanging off the committee above.
        e_commerce = Organization(
            'Subcommittee on E-Commerce',
            parent=technology,
            classification='committee',
        )
        yield e_commerce

        legislator = Legislator(
            'Paul Tagliamonte',
            district='6',
            chamber='upper',
            party='Independent',
        )
        legislator.add_committee_membership('Finance')
        legislator.add_membership(technology, role='chairman')
        yield legislator
    def scrape_ward(self, el):
        """Scrape one ward page and return its alderman as a Legislator.

        Args:
            el: link element whose ``attrib['href']`` is the ward page URL.

        Returns:
            A Legislator named after the page's ``h3`` heading (without a
            leading "Alderman"), with the page heading as ``post_id`` and one
            contact per labelled value found in the page's tables.

        Raises:
            IndexError: the name heading or the page heading is missing.
            ValueError: a table row does not have exactly two cells.
            KeyError: a row label is not one of the known contact labels.
        """
        url = el.attrib['href']
        page = self.lxmlize(url)
        name = page.xpath("//div[@id='content-content']/h3")[0].text_content()
        badthings = [
            "Alderman"
        ]
        for thing in badthings:
            if name.startswith(thing):
                name = name[len(thing):].strip()

        district = page.xpath("//h1[@class='page-heading']/text()")[0]
        leg = Legislator(name=name, post_id=district)
        leg.add_source(url)

        # Row label -> (contact type, note stored with the contact).
        type_types = {
            "City Hall Office:": ("address", "City Hall Office"),
            "City Hall Phone:": ("phone", "City Hall Phone"),
            "Phone:": ("phone", "Personal Phone"),
            "Office:": ("address", "Personal Office"),
            "Fax:": ("fax", "Fax"),
            "Fax": ("fax", "Fax"),
        }

        for row in page.xpath("//table//tr"):
            type_, val = (x.text_content().strip() for x in row.xpath("./td"))
            if val == "":
                continue

            if "\n" in type_:
                if "\n" not in val:
                    # Several labels but a single value: there is no way to
                    # tell which label the value belongs to.
                    continue
                labels = [part.strip() for part in type_.split("\n")]
                values = [part.strip() for part in val.split("\n")]
            else:
                labels, values = [type_], [val]

            # Pair the n-th label with the n-th value. Nested loops would
            # attach every value to every label (e.g. store a phone number
            # as a fax as well). zip() stops at the shorter list.
            for label, value in zip(labels, values):
                ctype, note = type_types[label]
                leg.add_contact(ctype, value, note)

        return leg
    def bos_scrape_people(self):
        """Yield one Legislator per councillor cell found on MEMBER_LIST.

        A cell provides the thumbnail image, the homepage link (whose text
        is the name) and the post, taken from the text following the first
        ``<br>``. The value returned by ``self.scrape_homepage`` is read as
        a mapping: 'bio' is required, 'image' is optional and, when present
        and non-empty, replaces the thumbnail.
        """
        member_list = self.lxmlize(MEMBER_LIST)

        for entry in member_list.xpath(
                "//table[@width='100%']//td[@style='TEXT-ALIGN: center']"):
            img_el = self.get_one(entry, ".//img")
            anchor = self.get_one(
                entry, ".//a[contains(@href, 'councillors') and (text()!='')]")
            post = entry.xpath(".//br")[0].tail.strip()

            # Start from the listing thumbnail; the homepage image, if any,
            # takes precedence below.
            image = img_el.attrib['src']
            homepage = anchor.attrib['href']
            display_name = clean_name(anchor.text)

            details = self.scrape_homepage(homepage)
            homepage_image = details.get('image', None)
            if homepage_image:
                image = details['image']

            person = Legislator(name=display_name,
                                post_id=post,
                                image=image,
                                biography=details['bio'])
            person.add_link(homepage, 'homepage')
            person.add_source(homepage)
            person.add_source(MEMBER_LIST)
            yield person
    def scrape(self):
        """Yield each committee listed on MEMBER_LIST together with its members.

        For every table row after the first, a Committee is built from the
        first cell and one Legislator from each "name, address" entry of the
        second cell. Entries containing "TBA", and entries whose address part
        contains "Resigned", are skipped. A role marker found in the address
        part is cut off and decides the value passed as ``district``
        ("member" when no marker is present). The members of a committee are
        yielded before the committee itself.

        Raises:
            ValueError: a row does not have exactly three child elements.
            IndexError: the committee cell contains no link.
            AttributeError: an entry matches neither name/address pattern.
        """
        # Raw strings keep "\d" and "\w" as regex escapes instead of invalid
        # string escapes.
        with_comma = re.compile(r"(?P<name>.*), (?P<addr>\d+\w* .*)")
        without_comma = re.compile(r"(?P<name>.*) (?P<addr>\d+\w* .*)")

        # A tuple fixes the test order on every interpreter: "Vice Chair"
        # must be tried before "Chair", which is a substring of it.
        role_markers = (
            ("Vice Chair", "Vice Chair"),
            ("Chair", "Chair"),
            ("CHAIR", "Chair"),
            ("Appt", "member"),
        )

        page = self.lxmlize(MEMBER_LIST)
        for row in page.xpath("//table[@frame='void']/tbody/tr")[1:]:
            role_cell, whos, expire = row.xpath("./*")
            # zip() stops at the shorter list, so a name without a matching
            # entry in the third column is not processed.
            people = zip([x.text_content() for x in whos.xpath(".//font")],
                         [x.text_content() for x in expire.xpath(".//font")])

            comm = Committee(name=role_cell.text_content())
            url = role_cell.xpath(".//a")[0].attrib['href']
            comm.add_link(url=url, note='homepage')

            for person, _ in people:
                if "TBA" in person:
                    continue

                # Prefer the "name, address" form and fall back to a plain
                # space separator. A failed match leaves None here, so the
                # next line raises AttributeError.
                match = with_comma.match(person) or without_comma.match(person)
                info = match.groupdict()

                addr = info['addr']
                if "Resigned" in addr:
                    continue

                position = "member"
                for marker, title in role_markers:
                    if marker in addr:
                        # Keep only the text before the last occurrence of
                        # the marker.
                        addr = addr.rsplit(marker, 1)[0].strip()
                        position = title

                leg = Legislator(name=info['name'], district=position)
                leg.add_contact_detail(type="address",
                                       value=clean_address(addr),
                                       note="Address")
                leg.add_source(MEMBER_LIST)
                # Attach the membership before yielding: a consumer that
                # stores each object as soon as it receives it would miss a
                # membership added after the yield.
                leg.add_membership(comm)
                yield leg

            comm.add_source(MEMBER_LIST)
            yield comm
    def scrape_homepage(self, folk):
        """Return a Legislator built from the member page *folk* links to.

        The page address is read from ``folk.attrib['href']``. The first
        image whose source contains the members upload path becomes the
        portrait, the single ``h3`` under the common content div gives the
        name (anything other than exactly one ``h3`` raises ValueError), and
        the div's direct paragraphs, joined by blank lines, form the
        biography.
        """
        url = folk.attrib['href']
        page = self.lxmlize(url)

        member_images = page.xpath(
            "//img[contains(@src, 'uploadedImages/City_Council/Members/')]")
        image = member_images[0].attrib['src']

        [heading] = page.xpath(
            "//div[@id='ctl00_ctl00_Body_body_cntCommon']/h3")

        paragraphs = page.xpath(
            "//div[@id='ctl00_ctl00_Body_body_cntCommon']/p")
        bio = "\n\n".join(par.text_content() for par in paragraphs)

        member = Legislator(name=heading.text,
                            post_id='member',
                            biography=bio,
                            image=image)
        member.add_source(url)
        return member
    def _scrape_people(self):
        """Yield a Legislator per councilor listed on the cabq.gov council page.

        After skipping the first three children of the content div, the
        remaining children are taken in groups of three (name element, info
        element, ignored element). The first link of the info element leads
        to the page that supplies the photo and the biography.
        """
        url = 'http://www.cabq.gov/council/councilors'
        listing = self.lxmlize(url)
        entries = iter(
            listing.xpath("//div[@id='parent-fieldname-text']/*")[3:])

        # One shared iterator passed three times yields consecutive triples.
        for name_el, info_el, _ in zip(entries, entries, entries):
            # NOTE(review): image_small, email, policy_analyst and phone are
            # not used further down. The lookups stay because they raise
            # when an entry does not have the expected structure.
            image_small = name_el.xpath(".//img")[0].attrib['src']
            full_name = name_el.text_content()
            profile_link, email, policy_analyst = info_el.xpath(".//a")
            phone = info_el.xpath(".//b")[-1].tail.strip()

            district = profile_link.text_content()
            profile = self.lxmlize(profile_link.attrib['href'])
            images = profile.xpath("//div[@class='featureContent']//img")
            photo = images[0].attrib['src']

            paragraphs = profile.xpath(
                "//div[@class='featureContent']//div[@class='stx']/p")
            bio = "\n".join(par.text_content() for par in paragraphs)

            councilor = Legislator(name=full_name,
                                   district=district,
                                   image=photo,
                                   biography=bio)
            councilor.add_source(url)
            councilor.add_source(profile_link.attrib['href'])
            yield councilor
Beispiel #14
0
    def get_people(self):
        """Yield one sample Legislator per hard-coded name/district record.

        Every legislator gets "http://example.com" as its source and an
        'office email' contact whose local part is the district slug: the
        district lower-cased, with spaces replaced by hyphens and commas
        removed.
        """
        people = [
            {"name": "Mckenzie A. Cannon", "district": "10a"},
            {"name": "Yandel V. Watkins",
             "district": "Second Fnord and Norfolk"},
            {"name": "Adrien A. Coffey", "district": "A"},
            {"district": "10c", "name": "Natasha Moon"},
            {"district": "Berkshire, Hampshire, Franklin and Hampden",
             "name": "Ramon Harmon"},
            {"district": "5", "name": "Sam Sellers"},
            {"district": "6", "name": "Estrella Hahn"},
            {"district": "B", "name": "Teagan Rojas"},
            {"district": "C", "name": "Barrett Adams"},
            {"district": "D", "name": "Kayla Shelton"},
            {"district": "E", "name": "Kohen Dudley"},
            {"district": "F", "name": "Cayden Norman"},
            {"district": "ZZ", "name": "Shayla Fritz"},
            {"district": "Ward 2", "name": "Gunnar Luna"},
            {"district": "Green", "name": "Regina Cruz"},
            {"district": "Blue", "name": "Makenzie Keller"},
            {"district": "Red", "name": "Eliana Meyer"},
            {"district": "Yellow", "name": "Taylor Parrish"},
            {"district": "Silver", "name": "Callie Craig"},
        ]

        for person in people:
            legislator = Legislator(**person)
            legislator.add_source("http://example.com")
            dslug = (
                person['district'].lower().replace(" ", "-").replace(",", ""))
            legislator.add_contact_detail(
                type='email',
                # The template needs a %s placeholder: applying "%" to a
                # literal without one raises TypeError.
                # NOTE(review): example.com matches the placeholder source
                # URL above - confirm the intended domain.
                value="%[email protected]" % dslug,
                note='office email'
            )
            yield legislator
Beispiel #15
0
    def nyc_scrape_people(self):
        """Yield a Legislator for every data row of the members table.

        A data row must hold exactly four ``td`` cells (name, district,
        borough, party); rows without any ``td`` are skipped. The name cell's
        first link gives both the display name and the homepage URL. A
        missing or blank party is recorded as "other".

        Raises:
            ValueError: a row has ``td`` cells, but not exactly four.
            IndexError: the name cell contains no link.
        """
        page = self.lxmlize(MEMBER_PAGE)
        for row in page.xpath("//table[@id='members_table']//tr"):
            cells = row.xpath(".//td")
            if not cells:
                continue

            # The third cell (borough) is unpacked to enforce the row shape
            # but is not passed to the Legislator.
            name_cell, district_cell, _, party_cell = cells
            link = name_cell.xpath(".//a")[0]
            homepage = link.attrib['href']

            # NOTE(review): the result of scrape_homepage is not used; the
            # call is kept in case callers rely on it running - confirm.
            self.scrape_homepage(homepage)

            # An empty cell has ``.text`` set to None, so guard before
            # stripping; otherwise the "other" fallback is never reached.
            party = (party_cell.text or "").strip() or "other"
            p = Legislator(
                name=link.text,
                post_id=district_cell.text,
                party=party)
            p.add_link(homepage, 'homepage')
            p.add_source(homepage)
            p.add_source(MEMBER_PAGE)
            yield p
    def cleveland_scrape_people(self):
        """Yield one Legislator per centred cell of the Cleveland member table.

        For each cell, the first ``<strong>`` holds the name and, after its
        ``<br>``, a slug that INFOSLUG splits into 'district' and 'gender'.
        If the cell links to a detail page, ``self.scrape_page`` provides an
        optional 'bio' and the 'committees' list. Committee entries of the
        form "<name> - Chair" or "<name> - Vice Chair" are stored under
        <name> with that title as role; all others get the role "member".
        """
        listing = "http://www.clevelandcitycouncil.org/council-members/"
        index = self.lxmlize(listing)
        valid_titles = ["Chair", "Vice Chair"]

        table = index.xpath("//div[@class='standard-content column']//table")[0]
        for cell in table.xpath(".//td[@align='center']"):
            heading = cell.xpath(".//strong")[0]
            who = heading.text.strip()
            slug = heading.xpath("./br")[0].tail.strip()
            img = cell.xpath(".//img")[0].attrib['src']
            info = INFOSLUG.match(slug).groupdict()

            # Without a link there is nothing more to scrape for this member.
            scraped_info = {}
            anchors = cell.xpath(".//a")
            if anchors:
                detail_url = anchors[0].attrib['href']
                scraped_info = self.scrape_page(detail_url)

            # Pass the biography only when one was actually found.
            optional = {}
            biography = scraped_info.get('bio', None)
            if biography:
                optional['biography'] = biography

            member = Legislator(name=who,
                                post_id=info['district'],
                                gender=info['gender'],
                                image=img,
                                **optional)
            member.add_source(listing)

            for raw in scraped_info.get('committees', []):
                committee = raw.strip()
                if not committee:
                    continue

                role = "member"
                if "-" in committee:
                    # Split on the last hyphen only; the title, if any, is
                    # whatever follows it.
                    head, tail = (
                        part.strip() for part in committee.rsplit("-", 1))
                    if tail in valid_titles:
                        committee = head
                        role = tail
                member.add_committee_membership(committee, role=role)
            yield member
    def nyc_scrape_people(self):
        """Yield one Legislator per data row of the table ``members_table``.

        Rows without ``td`` cells are skipped; every other row must contain
        exactly four cells in the order name, district, borough, party. The
        first link in the name cell supplies the name text and the homepage
        URL. When the party cell is empty or blank, "other" is used.

        Raises:
            ValueError: a row has ``td`` cells, but not exactly four.
            IndexError: the name cell contains no link.
        """
        page = self.lxmlize(MEMBER_PAGE)
        for entry in page.xpath("//table[@id='members_table']//tr"):
            entries = entry.xpath(".//td")
            if not entries:
                continue

            # The borough cell is part of the expected row shape but is not
            # stored on the Legislator.
            name_td, district_td, _, party_td = entries
            anchor = name_td.xpath(".//a")[0]
            homepage = anchor.attrib['href']

            # NOTE(review): scrape_homepage's return value is unused; the
            # call is retained in case it is relied on - confirm.
            self.scrape_homepage(homepage)

            # ``.text`` is None for a cell without text; calling .strip() on
            # it would raise before the "other" fallback could apply.
            party_text = party_td.text or ""
            p = Legislator(name=anchor.text,
                           post_id=district_td.text,
                           party=party_text.strip() or "other")
            p.add_link(homepage, 'homepage')
            p.add_source(homepage)
            p.add_source(MEMBER_PAGE)
            yield p
    def scrape_ward(self, el):
        """Return the alderman described on the ward page *el* links to.

        Args:
            el: link element; ``el.attrib['href']`` is the ward page URL.

        Returns:
            A Legislator whose name is the page's ``h3`` text with a leading
            "Alderman" removed, whose ``post_id`` is the page heading, and
            which carries one contact for each label/value pair in the
            page's tables.

        Raises:
            IndexError: the name heading or the page heading is missing.
            ValueError: a table row does not have exactly two cells.
            KeyError: a row label is not one of the known contact labels.
        """
        url = el.attrib['href']
        page = self.lxmlize(url)
        name = page.xpath("//div[@id='content-content']/h3")[0].text_content()
        badthings = ["Alderman"]
        for thing in badthings:
            if name.startswith(thing):
                name = name[len(thing):].strip()

        district = page.xpath("//h1[@class='page-heading']/text()")[0]
        leg = Legislator(name=name, post_id=district)
        leg.add_source(url)

        # Row label -> (contact type, note stored with the contact).
        type_types = {
            "City Hall Office:": ("address", "City Hall Office"),
            "City Hall Phone:": ("phone", "City Hall Phone"),
            "Phone:": ("phone", "Personal Phone"),
            "Office:": ("address", "Personal Office"),
            "Fax:": ("fax", "Fax"),
            "Fax": ("fax", "Fax"),
        }

        for row in page.xpath("//table//tr"):
            type_, val = (x.text_content().strip() for x in row.xpath("./td"))
            if val == "":
                continue

            types = [type_]
            vals = [val]

            if "\n" in type_:
                if "\n" in val:
                    # Both cells are multi-line: line n of the label cell
                    # describes line n of the value cell.
                    types = [piece.strip() for piece in type_.split("\n")]
                    vals = [piece.strip() for piece in val.split("\n")]
                else:
                    # Several labels for one value is ambiguous; skip it.
                    continue

            # Positional pairing. Combining every label with every value
            # would record, for example, the phone number as a fax too.
            # zip() stops at the shorter list.
            for label, value in zip(types, vals):
                ctype, note = type_types[label]
                leg.add_contact(ctype, value, note)

        return leg
Beispiel #19
0
    def get_people(self):
        """Yield a sample Legislator for each hard-coded (name, district) pair.

        Each legislator is given "http://example.com" as source and an
        'office email' contact. The address's local part is the district
        slug: lower-cased district, spaces turned into hyphens, commas
        dropped.
        """
        # (name, district) pairs, in the order they are yielded.
        people = [
            ("Mckenzie A. Cannon", "10a"),
            ("Yandel V. Watkins", "Second Fnord and Norfolk"),
            ("Adrien A. Coffey", "A"),
            ("Natasha Moon", "10c"),
            ("Ramon Harmon", "Berkshire, Hampshire, Franklin and Hampden"),
            ("Sam Sellers", "5"),
            ("Estrella Hahn", "6"),
            ("Teagan Rojas", "B"),
            ("Barrett Adams", "C"),
            ("Kayla Shelton", "D"),
            ("Kohen Dudley", "E"),
            ("Cayden Norman", "F"),
            ("Shayla Fritz", "ZZ"),
            ("Gunnar Luna", "Ward 2"),
            ("Regina Cruz", "Green"),
            ("Makenzie Keller", "Blue"),
            ("Eliana Meyer", "Red"),
            ("Taylor Parrish", "Yellow"),
            ("Callie Craig", "Silver"),
        ]

        for name, district in people:
            legislator = Legislator(name=name, district=district)
            legislator.add_source("http://example.com")
            dslug = district.lower().replace(" ", "-").replace(",", "")
            # The template must contain a %s placeholder; "%" applied to a
            # literal without one raises TypeError.
            # NOTE(review): example.com mirrors the placeholder source URL
            # above - confirm the intended domain.
            legislator.add_contact_detail(type='email',
                                          value="%[email protected]" % dslug,
                                          note='office email')
            yield legislator
Beispiel #20
0
  def scrape_mayor(self):
    """Scrape the mayor's page and contact page and return a Legislator.

    The name is the first ``h1`` text inside the 'detail' div with
    "Toronto Mayor" removed. The page behind the "Contact the Mayor" link
    supplies a mailing address (the first six paragraph text nodes joined
    by ", ") and a phone number (first text node of a third ``p``).

    Returns:
      The Legislator, with COUNCIL_PAGE and both scraped pages as sources.

    Raises:
      IndexError: an expected heading, link, div or text node is missing.
    """
    url = 'http://www1.toronto.ca/wps/portal/contentonly?vgnextoid=e53332d0b6d1e310VgnVCM10000071d60f89RCRD&vgnextfmt=default'
    page = self.lxmlize(url)
    heading = page.xpath("//div[@class='detail']//h1/text()")[0]
    name = heading.replace("Toronto Mayor", "").strip()
    p = Legislator(name, "Toronto")

    p.add_source(COUNCIL_PAGE)
    p.add_source(url)

    contact_url = page.xpath('//a[contains(text(),"Contact the Mayor")]')[0].attrib['href']
    p.add_source(contact_url)
    contact_page = self.lxmlize(contact_url)

    info = contact_page.xpath('//div[@class="detail"]')[0]
    # Joining can leave ",," behind when a text node already ends with a
    # comma; collapse those.
    address = ', '.join(info.xpath('.//p/text()')[0:6]).replace(",,", ",")
    phone = info.xpath('.//p[3]/text()')[0]

    p.add_contact('address', address, 'Mailing')
    p.add_contact('phone', phone, '')

    # Hand the finished object back; without this the scraped data is
    # discarded when the function ends.
    return p



#t = Toronto()
#s = TorontoPersonScraper(t,'m','/Users/alexio/Desktop',)
#s.toronto_scrape_people()
#s.toronto_scrape_people()
Beispiel #21
0
  def scrape_councilor(self, url):
    """Scrape one councillor page into a Legislator with links and contacts.

    Args:
      url: address of the councillor's page.

    The name is the second ``h3`` of the document (minus "Councillor") and
    the district is the text of the first ``p``. Homepage, Facebook and
    Twitter links are added when the 'last' div mentions them. Each
    paragraph containing "Phone:" contributes contacts, with the
    paragraph's first ``<strong>`` text as the note.

    NOTE(review): within this block ``p`` is never returned or yielded -
    confirm how callers are meant to receive it.

    Raises:
      IndexError: an expected element is missing from the page.
    """
    page = self.lxmlize(url)
    info = page.xpath("//div[@class='main']")[0]
    # These expressions start with "//", so they search the whole document
    # rather than only the element they are called on.
    name = info.xpath("//h3")[1].text_content().replace('Councillor','').strip()
    district = info.xpath("//p")[0].text_content()
    p = Legislator(name=name, district=district)

    info = info.xpath("//div[@class='last']")[0]

    # add links
    p.add_source(url)
    p.add_source(COUNCIL_PAGE)

    summary = info.text_content()
    if "website:" in summary:
      p.add_link(info.xpath('.//a')[1].attrib['href'], 'homepage')

    if "Facebook" in summary:
      p.add_link(info.xpath('//a[contains(@href, "facebook.com")]')[0].attrib['href'],'facebook')

    if "Twitter" in summary:
      p.add_link(info.xpath('//a[contains(@href,"twitter.com")]')[0].attrib['href'],'twitter')

    # add contact info
    p.add_contact('email', info.xpath('.//a')[0].text_content(),'')
    contacts = info.xpath('//div/p[text()[contains(.,"Phone:")]]')
    for contact in contacts:
      note = contact.xpath('.//strong')[0].text_content()
      # Every node that follows a <br> in the paragraph. The code below
      # reads the even positions as text.
      nodes = contact.xpath('br/following-sibling::node()')
      if len(nodes) > 8:
        continue
      if len(nodes) >= 4:
        address = (nodes[0]+", "+nodes[2]).strip()
        p.add_contact('address',address,note)
        # Check the length before indexing: a list of exactly four nodes
        # has no position 4, and one of six has no position 6.
        if len(nodes) > 4 and "Phone: " in nodes[4]:
          phone = nodes[4].replace("Phone: ",'').strip()
          p.add_contact('phone',phone,note)
        if len(nodes) > 6 and "Fax:" in nodes[6]:
          fax = nodes[6].replace("Fax: ",'').strip()
          p.add_contact('fax',fax,note)
      else:
        # Short form: the first text is the phone number, the next text is
        # the fax number.
        phone = nodes[0].strip()
        p.add_contact('phone',phone,note)
        fax = nodes[2].strip()
        p.add_contact('fax',fax,note)