コード例 #1
0
    def scrape(self):
        """Yield a ``Legislator`` for each member on the county board page.

        Fetches ``self.COUNTY_BOARD_URL`` and, for every member row, reads the
        name, photo URL, position and e-mail address from the markup. When the
        row has a link whose text contains "bio", that page is fetched as well
        and its tag-stripped text becomes the legislator's biography.

        Yields:
            Legislator: one per ``div[name=row]`` inside ``div[name=cbo_list]``.

        Raises:
            IndexError: if a row lacks the expected name/picture/info elements
                or the info block contains no ``<br>``.
        """
        board_html = self.urlopen(self.COUNTY_BOARD_URL)
        board_lxml = lxml.html.fromstring(board_html)
        board_lxml.make_links_absolute(base_url=self.COUNTY_BOARD_URL)

        for board_member_lxml in board_lxml.cssselect("div[name=cbo_list] div[name=row]"):
            name = board_member_lxml.cssselect("div[name=info] strong")[0].text.strip()
            image = board_member_lxml.cssselect("div[name=pictures] img")[0].get('src')

            # The position is the chunk of markup that follows the first <br>.
            info_html = lxml.html.tostring(board_member_lxml.cssselect("div[name=info]")[0]).decode()
            # Flags must be passed by keyword: the third positional argument of
            # re.split is ``maxsplit``, so a positional re.I silently became
            # maxsplit=2 and the match stayed case-sensitive.
            pieces = re.split(r'<br\s*\/?>', info_html, flags=re.I)
            position = re.sub(r'<[^>]*>', '', pieces[1]).strip()

            links = board_member_lxml.cssselect("div[name=info] a")
            email = bio_link = None
            for link in links:
                if link.text is None:
                    continue
                if 'arlingtonva.us' in link.text.lower():
                    # Addresses are obfuscated as "user (at) domain".
                    email = re.sub(r'\s*\(at\)\s*', '@', link.text).strip()
                elif 'bio' in link.text.lower():
                    bio_link = link

            legislator = Legislator(name=name, district=position, image=image)
            # Only record an e-mail contact when one was actually found; a
            # contact whose value is None carries no information.
            if email is not None:
                legislator.add_contact(type='email', value=email, note='%(name)s email address' % {'name': name})
            legislator.add_source(self.COUNTY_BOARD_URL)

            if bio_link is not None:
                bio_href = bio_link.attrib.get('href')
                bio_html = self.urlopen(bio_href)
                bio_lxml = lxml.html.fromstring(bio_html)
                bio_markup = lxml.html.tostring(bio_lxml.cssselect('#textSection #text')[0]).decode()
                # Strip every tag. The original passed re.I as the fourth
                # positional argument of re.sub, which is ``count``, so only
                # the first two tags were removed. The pattern contains no
                # letters, so no flag is needed at all.
                bio_text = re.sub(r'<[^>]*>', '', bio_markup).strip()
                bio_text = re.sub(r'&#160;', ' ', bio_text)
                legislator.biography = bio_text
                legislator.add_link('bio page', bio_href)

            yield legislator
コード例 #2
0
    def get_people(self):
        """Yield a ``Legislator`` for each member on the county board page.

        Fetches ``self.COUNTY_BOARD_URL`` and, for every member row, reads the
        name, photo URL, position and e-mail address from the markup. When the
        row has a link whose text contains "bio", that page is fetched as well
        and its tag-stripped text becomes the legislator's biography.

        Yields:
            Legislator: one per ``div[name=row]`` inside ``div[name=cbo_list]``.

        Raises:
            IndexError: if a row lacks the expected name/picture/info elements
                or the info block contains no ``<br>``.
        """
        board_html = self.urlopen(self.COUNTY_BOARD_URL)
        board_lxml = lxml.html.fromstring(board_html)
        board_lxml.make_links_absolute(base_url=self.COUNTY_BOARD_URL)

        for board_member_lxml in board_lxml.cssselect(
                "div[name=cbo_list] div[name=row]"):
            name = board_member_lxml.cssselect(
                "div[name=info] strong")[0].text.strip()
            image = board_member_lxml.cssselect(
                "div[name=pictures] img")[0].get('src')

            # tostring() returns bytes by default; decode so the str regexes
            # below can be applied (a str pattern on bytes raises TypeError on
            # Python 3). Decoding rather than asking lxml for unicode output
            # keeps character references such as "&#160;" intact.
            info_html = lxml.html.tostring(
                board_member_lxml.cssselect("div[name=info]")[0]).decode()
            # Flags must be passed by keyword: the third positional argument of
            # re.split is ``maxsplit``, so a positional re.I silently became
            # maxsplit=2 and the match stayed case-sensitive.
            pieces = re.split(r'<br\s*\/?>', info_html, flags=re.I)
            # The position is the chunk of markup that follows the first <br>.
            position = re.sub(r'<[^>]*>', '', pieces[1]).strip()

            links = board_member_lxml.cssselect("div[name=info] a")
            email = bio_link = None
            for link in links:
                if link.text is None:
                    continue
                if 'arlingtonva.us' in link.text.lower():
                    # Addresses are obfuscated as "user (at) domain".
                    email = re.sub(r'\s*\(at\)\s*', '@', link.text).strip()
                elif 'bio' in link.text.lower():
                    bio_link = link

            legislator = Legislator(name=name, post_id=position, image=image)
            # Only record an e-mail contact when one was actually found; a
            # contact whose value is None carries no information.
            if email is not None:
                legislator.add_contact(type='email',
                                       value=email,
                                       note='%(name)s email address' %
                                       {'name': name})
            legislator.add_source(self.COUNTY_BOARD_URL)

            if bio_link is not None:
                bio_href = bio_link.attrib.get('href')
                bio_html = self.urlopen(bio_href)
                bio_lxml = lxml.html.fromstring(bio_html)
                bio_markup = lxml.html.tostring(
                    bio_lxml.cssselect('#textSection #text')[0]).decode()
                # Strip every tag. The original passed re.I as the fourth
                # positional argument of re.sub, which is ``count``, so only
                # the first two tags were removed. The pattern contains no
                # letters, so no flag is needed at all.
                bio_text = re.sub(r'<[^>]*>', '', bio_markup).strip()
                bio_text = re.sub(r'&#160;', ' ', bio_text)
                legislator.biography = bio_text
                legislator.add_link('bio page', bio_href)

            yield legislator
コード例 #3
0
    def get_people(self):
        """Yield a committee, its subcommittee and one legislator, in that order.

        The legislator is given a 'chairman' membership of the committee, which
        is created with a matching 'Chairman' post.
        """
        # Top-level committee, including the post referenced by the membership
        # added further down.
        technology = Organization('Technology', classification='committee')
        technology.add_post('Chairman', 'chairman')
        technology.add_source('https://example.com')
        yield technology

        # Subcommittee hanging off the committee above.
        e_commerce = Organization(
            'Subcommittee on E-Commerce',
            parent=technology,
            classification='committee',
        )
        e_commerce.add_source('https://example.com')
        yield e_commerce

        # The person who chairs the top-level committee.
        chair = Legislator('Paul Tagliamonte', '6')
        chair.add_membership(technology, role='chairman')
        chair.add_source('https://example.com')
        yield chair
コード例 #4
0
ファイル: parser.py プロジェクト: paultag/ssu
def people_to_pupa(stream, transaction):
    """Convert an iterable of row mappings into pupa objects.

    For every row this yields a validated ``Legislator`` followed by that
    legislator's related objects. Once the stream is exhausted it yields the
    legislature ``Organization``'s related objects and then the organization
    itself.

    Args:
        stream: iterable of dict-like rows. "Name" and "District" are
            required; "Email N", "Address N" and "Phone N" (N = 1..3) are
            optional and become contact details when non-empty.
        transaction: object exposing ``jurisdiction.name`` and
            ``jurisdiction.id``.

    Raises:
        ValueError: if a row's name or district is missing, ``None`` or blank.
    """
    org = Organization(
        name=transaction.jurisdiction.name,
        classification='legislature',
    )

    # (contact type, source columns); the column name is stored as the note.
    contact_columns = (
        ("email", ("Email 1", "Email 2", "Email 3")),
        ("address", ("Address 1", "Address 2", "Address 3")),
        ("voice", ("Phone 1", "Phone 2", "Phone 3")),
    )

    for row in stream:
        # XXX: Validate the row better.
        # ``or ""`` also covers keys that are present but hold None (for
        # instance short rows read through csv.DictReader), so those rows are
        # rejected with the ValueError below instead of an AttributeError.
        name = (row.get("Name") or "").strip()
        district = (row.get("District") or "").strip()

        if not name:
            raise ValueError("A name is required for each entry.")

        if not district:
            raise ValueError("A district is required for each entry.")

        obj = Legislator(name=name, district=district)
        org.add_post(label=district, role="member")

        for contact_type, columns in contact_columns:
            for column in columns:
                value = row.get(column)
                if value:
                    obj.add_contact_detail(
                        type=contact_type, value=value, note=column)

        obj.add_source(url=OCD_SOURCE_URL)
        obj.validate()
        obj.pre_save(transaction.jurisdiction.id)

        yield obj

        for related in obj._related:
            yield related

    for related in org._related:
        yield related
    yield org
コード例 #5
0
    def get_people(self):
        """Yield a ``Legislator`` for every entry from ``self.councilMembers()``.

        Each entry is a ``(councilman, committees)`` pair. The councilman
        record supplies the name, ward, photo, contact details and optional
        website. Each committee record with a non-empty 'Legislative Body'
        label is added as a committee membership, except for 'City Council'
        and 'Office of the Mayor'.
        """
        # (record field, contact type); the field name doubles as the note.
        # Kept as a tuple so contacts are always emitted in this order.
        contact_fields = (
            ("City Hall Office", "address"),
            ("City Hall Phone", "phone"),
            ("Ward Office Phone", "phone"),
            ("Ward Office Address", "address"),
            ("Fax", "fax"),
        )
        # Bodies that are skipped rather than recorded as committee
        # memberships.
        skipped_bodies = ('City Council', 'Office of the Mayor')

        for councilman, committees in self.councilMembers():
            contacts = [
                {
                    "type": contact_type,
                    "value": councilman[field],
                    "note": field
                }
                for field, contact_type in contact_fields
                if councilman[field]
            ]

            if councilman["E-mail"]:
                contacts.append({
                    "type": "email",
                    "value": councilman['E-mail']['label'],
                    'note': 'E-mail'
                })

            p = Legislator(councilman['Person Name']['label'],
                           post_id="Ward %s" % (councilman['Ward/Office']),
                           image=councilman['Photo'],
                           contact_details=contacts)

            if councilman['Website']:
                p.add_link('homepage', councilman['Website']['url'])
            p.add_source(MEMBERLIST)

            for committee, _, _ in committees:
                body = committee['Legislative Body']['label']
                # A leftover debugging print of the raw committee record used
                # to sit here; it has been removed.
                if body and body not in skipped_bodies:
                    p.add_committee_membership(body, role=committee["Title"])

            yield p
コード例 #6
0
ファイル: parser.py プロジェクト: paultag/ssu
def people_to_pupa(stream, transaction):
    """Convert an iterable of row mappings into pupa objects.

    For every row this yields a validated ``Legislator`` followed by that
    legislator's related objects. Once the stream is exhausted it yields the
    legislature ``Organization``'s related objects and then the organization
    itself.

    Args:
        stream: iterable of dict-like rows. "Name" and "District" are
            required; "Email N", "Address N" and "Phone N" (N = 1..3) are
            optional and become contact details when non-empty.
        transaction: object exposing ``jurisdiction.name`` and
            ``jurisdiction.id``.

    Raises:
        ValueError: if a row's name or district is missing, ``None`` or blank.
    """
    org = Organization(name=transaction.jurisdiction.name, classification="legislature")

    # (contact type, source columns); the column name is stored as the note.
    contact_columns = (
        ("email", ("Email 1", "Email 2", "Email 3")),
        ("address", ("Address 1", "Address 2", "Address 3")),
        ("voice", ("Phone 1", "Phone 2", "Phone 3")),
    )

    for row in stream:
        # XXX: Validate the row better.
        # ``or ""`` also covers keys that are present but hold None (for
        # instance short rows read through csv.DictReader), so those rows are
        # rejected with the ValueError below instead of an AttributeError.
        name = (row.get("Name") or "").strip()
        district = (row.get("District") or "").strip()

        if not name:
            raise ValueError("A name is required for each entry.")

        if not district:
            raise ValueError("A district is required for each entry.")

        obj = Legislator(name=name, district=district)
        org.add_post(label=district, role="member")

        for contact_type, columns in contact_columns:
            for column in columns:
                value = row.get(column)
                if value:
                    obj.add_contact_detail(type=contact_type, value=value, note=column)

        obj.add_source(url=OCD_SOURCE_URL)
        obj.validate()
        obj.pre_save(transaction.jurisdiction.id)

        yield obj

        for related in obj._related:
            yield related

    for related in org._related:
        yield related
    yield org
コード例 #7
0
    def get_people(self):
        """Yield a ``Legislator`` for every entry from ``self.councilMembers()``.

        Each entry is a ``(councilman, committees)`` pair. The councilman
        record supplies the name, ward, photo, contact details and optional
        website. Each committee record with a non-empty 'Legislative Body'
        label is added as a committee membership, except for 'City Council'
        and 'Office of the Mayor'.
        """
        # (record field, contact type); the field name doubles as the note.
        # Kept as a tuple so contacts are always emitted in this order.
        contact_fields = (
            ("City Hall Office", "address"),
            ("City Hall Phone", "phone"),
            ("Ward Office Phone", "phone"),
            ("Ward Office Address", "address"),
            ("Fax", "fax"),
        )
        # Bodies that are skipped rather than recorded as committee
        # memberships.
        skipped_bodies = ('City Council', 'Office of the Mayor')

        for councilman, committees in self.councilMembers():
            contacts = [
                {"type": contact_type,
                 "value": councilman[field],
                 "note": field}
                for field, contact_type in contact_fields
                if councilman[field]
            ]

            if councilman["E-mail"]:
                contacts.append({"type": "email",
                                 "value": councilman['E-mail']['label'],
                                 'note': 'E-mail'})

            p = Legislator(councilman['Person Name']['label'],
                           post_id="Ward %s" % (councilman['Ward/Office']),
                           image=councilman['Photo'],
                           contact_details=contacts)

            if councilman['Website']:
                p.add_link('homepage', councilman['Website']['url'])
            p.add_source(MEMBERLIST)

            for committee, _, _ in committees:
                body = committee['Legislative Body']['label']
                # A leftover debugging print of the raw committee record used
                # to sit here; it has been removed.
                if body and body not in skipped_bodies:
                    p.add_committee_membership(body, role=committee["Title"])

            yield p