Beispiel #1
0
    def handle_list_item(self, item):
        """Yield a ``Person`` built from one entry of the member list.

        ``item`` is the element for a single representative.  Its heading
        text has the form ``Name (NNX, PARTY)`` where ``NNX`` is a two-digit
        district number followed by ``A`` or ``B``.  The first two non-empty
        text lines of the entry's ``div`` form the address and the third is
        the phone number.

        Phone and e-mail are attached only when they pass
        ``validate_phone_number`` / ``validate_email_address``.
        """
        photo_url = item.xpath('./img/@src')[0]
        url = item.xpath('.//h5/a/@href')[0]
        name_text = item.xpath('.//h5/a/b/text()')[0]

        name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
        name = name_match.group(1).strip()
        # Districts are zero-padded on the page ("01A" -> "1A").
        district = name_match.group(2).lstrip('0').upper()
        party_text = name_match.group(3)
        party = PARTIES[party_text]

        info_texts = [x.strip() for x in item.xpath(
            './div/text()[normalize-space()]'
        ) if x.strip()]
        address = '\n'.join((info_texts[0], info_texts[1]))

        # Default to None so a failed validation cannot leave the name unbound.
        phone_text = info_texts[2]
        phone = phone_text if validate_phone_number(phone_text) else None

        # The second link of the entry is the mailto: link.
        email_text = item.xpath('.//a/@href')[1].replace('mailto:', '').strip()
        email = email_text if validate_email_address(email_text) else None

        rep = Person(name=name, district=district, party=party,
                     primary_org='lower', role='Representative',
                     image=photo_url)
        rep.add_link(url)
        rep.add_contact_detail(type='address', value=address, note='capitol')
        if phone is not None:
            rep.add_contact_detail(type='voice', value=phone, note='capitol')
        if email is not None:
            rep.add_contact_detail(type='email', value=email, note='capitol')
        rep.add_source(self.url)

        yield rep
Beispiel #2
0
    def _extract_email(self, doc):
        """Return the e-mail address assembled by the page's inline script.

        The script that follows the "Email:" label contains a line that
        concatenates quoted string fragments around ``"@"``.  The quoted
        fragments on that line are joined to form the address.

        Returns the address, or ``None`` when the script or the
        concatenation line is missing, or when the result does not pass
        ``validate_email_address``.
        """
        xpath = ('//div[@class="districtheadleft"]'
                 '/b[contains(text(), "Email:")]'
                 '/../following-sibling::div'
                 '/script/text()')
        scripts = doc.xpath(xpath)
        if not scripts:
            return None

        # filter() returns a non-subscriptable iterator on Python 3, so take
        # the first matching line with next() instead of indexing.
        line = next(
            (candidate for candidate in scripts[0].split('\r\n')
             if '+ "@" +' in candidate),
            None)
        if line is None:
            return None

        parts = re.findall(r'"(.+?)"', line)
        email = ''.join(parts)

        return email if validate_email_address(email) else None
    def _extract_email(self, doc):
        """Return the e-mail address assembled by the page's inline script.

        The script that follows the "Email:" label contains a line that
        concatenates quoted string fragments around ``"@"``.  The quoted
        fragments on that line are joined to form the address.

        Returns the address, or ``None`` when the script or the
        concatenation line is missing, or when the result does not pass
        ``validate_email_address``.
        """
        xpath = ('//div[@class="districtheadleft"]'
                 '/b[contains(text(), "Email:")]'
                 '/../following-sibling::div'
                 '/script/text()')
        scripts = doc.xpath(xpath)
        if not scripts:
            return None

        # filter() returns a non-subscriptable iterator on Python 3, so take
        # the first matching line with next() instead of indexing.
        line = next(
            (candidate for candidate in scripts[0].split('\r\n')
             if '+ "@" +' in candidate),
            None)
        if line is None:
            return None

        parts = re.findall(r'"(.+?)"', line)
        email = ''.join(parts)

        return email if validate_email_address(email) else None
Beispiel #4
0
    def _get_rep_email(self, district):
        """Look up a representative's e-mail on the House contact page.

        Fetches the contact page for ``district`` and reads the value of the
        ``txtMemberEmail`` input.  Returns ``None`` when the input is absent
        (``KeyError``) or when a non-empty value fails
        ``validate_email_address``; otherwise returns the value as found.
        """
        contact_url = (
            "https://www.okhouse.gov/Members/Contact.aspx?District=" + district
        )
        contact_page = self.curl_lxmlize(contact_url)

        try:
            address = contact_page.get_element_by_id("txtMemberEmail").value
        except KeyError:
            return None

        # Only a non-empty value is validated; empty values pass through.
        if address and not validate_email_address(address):
            return None
        return address
Beispiel #5
0
    def handle_list_item(self, item):
        """Yield a ``Person`` built from one table row of the member list.

        The first cell of ``item`` holds the photo; the second holds a
        paragraph whose first link carries the heading text
        ``Name (NNX, PARTY)`` and whose second link carries the e-mail text.
        The text lines after a ``<br>`` give the address (first two lines)
        and the phone number (third line).

        Phone and e-mail are attached only when they pass
        ``validate_phone_number`` / ``validate_email_address``.
        """
        photo_url = item.xpath('./td[1]/a/img/@src')[0]
        info_nodes = item.xpath('./td[2]/p/a')
        name_text = info_nodes[0].xpath('./b/text()')[0]
        url = info_nodes[0].get('href')

        name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
        name = name_match.group(1).strip()
        # Districts are zero-padded on the page ("01A" -> "1A").
        district = name_match.group(2).lstrip('0').upper()
        party_text = name_match.group(3)
        party = PARTIES[party_text]

        info_texts = [
            x.strip() for x in item.xpath(
                './td[2]/p/text()[normalize-space() and preceding-sibling::br]'
            ) if x.strip()
        ]
        address = '\n'.join((info_texts[0], info_texts[1]))

        # Default to None so a failed validation cannot leave the name unbound.
        phone_text = info_texts[2]
        phone = phone_text if validate_phone_number(phone_text) else None

        email_text = info_nodes[1].text
        email_text = email_text.replace('Email: ', '').strip()
        email = email_text if validate_email_address(email_text) else None

        rep = Person(name=name,
                     district=district,
                     party=party,
                     primary_org='lower',
                     role='Representative',
                     image=photo_url)
        rep.add_link(url)
        rep.add_contact_detail(type='address', value=address, note='capitol')
        if phone is not None:
            rep.add_contact_detail(type='voice', value=phone, note='capitol')
        if email is not None:
            rep.add_contact_detail(type='email', value=email, note='capitol')
        rep.add_source(self.url)

        yield rep
Beispiel #6
0
    def handle_list_item(self, item):
        """Yield a ``Person`` built from one entry of the member list.

        ``item`` is the element for a single representative.  Its heading
        text has the form ``Name (NNX, PARTY)`` where ``NNX`` is a two-digit
        district number followed by ``A`` or ``B``.  The first two non-empty
        text lines of the entry's ``div`` form the address and the third is
        the phone number.

        Phone and e-mail are attached only when they pass
        ``validate_phone_number`` / ``validate_email_address``.
        """
        photo_url = item.xpath("./img/@src")[0]
        url = item.xpath(".//h5/a/@href")[0]
        name_text = item.xpath(".//h5/a/b/text()")[0]

        name_match = re.match(r"^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$", name_text)
        name = name_match.group(1).strip()
        # Districts are zero-padded on the page ("01A" -> "1A").
        district = name_match.group(2).lstrip("0").upper()
        party_text = name_match.group(3)
        party = PARTIES[party_text]

        info_texts = [
            x.strip()
            for x in item.xpath("./div/text()[normalize-space()]")
            if x.strip()
        ]
        address = "\n".join((info_texts[0], info_texts[1]))

        # Default to None so a failed validation cannot leave the name unbound.
        phone_text = info_texts[2]
        phone = phone_text if validate_phone_number(phone_text) else None

        # The second link of the entry is the mailto: link.
        email_text = item.xpath(".//a/@href")[1].replace("mailto:", "").strip()
        email = email_text if validate_email_address(email_text) else None

        rep = Person(
            name=name,
            district=district,
            party=party,
            primary_org="lower",
            role="Representative",
            image=photo_url,
        )
        rep.add_link(url)
        rep.add_contact_detail(type="address", value=address, note="capitol")
        if phone is not None:
            rep.add_contact_detail(type="voice", value=phone, note="capitol")
        if email is not None:
            rep.add_contact_detail(type="email", value=email, note="capitol")
        rep.add_source(self.url)

        yield rep
Beispiel #7
0
    def handle_list_item(self, item):
        """Yield a ``Person`` built from one table row of the member list.

        The first cell of ``item`` holds the photo; the second holds a
        paragraph whose first link carries the heading text
        ``Name (NNX, PARTY)`` and whose second link carries the e-mail text.
        The text lines after a ``<br>`` give the address (first two lines)
        and the phone number (third line).

        Phone and e-mail are attached only when they pass
        ``validate_phone_number`` / ``validate_email_address``.
        """
        photo_url = item.xpath('./td[1]/a/img/@src')[0]
        info_nodes = item.xpath('./td[2]/p/a')
        name_text = info_nodes[0].xpath('./b/text()')[0]
        url = info_nodes[0].get('href')

        name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
        name = name_match.group(1).strip()
        # Districts are zero-padded on the page ("01A" -> "1A").
        district = name_match.group(2).lstrip('0').upper()
        party_text = name_match.group(3)
        party = PARTIES[party_text]

        info_texts = [x.strip() for x in item.xpath(
            './td[2]/p/text()[normalize-space() and preceding-sibling::br]'
        ) if x.strip()]
        address = '\n'.join((info_texts[0], info_texts[1]))

        # Default to None so a failed validation cannot leave the name unbound.
        phone_text = info_texts[2]
        phone = phone_text if validate_phone_number(phone_text) else None

        email_text = info_nodes[1].text
        email_text = email_text.replace('Email: ', '').strip()
        email = email_text if validate_email_address(email_text) else None

        rep = Person(name=name, district=district, party=party,
                     primary_org='lower', role='Representative',
                     image=photo_url)
        rep.add_link(url)
        rep.add_contact_detail(type='address', value=address)
        if phone is not None:
            rep.add_contact_detail(type='voice', value=phone)
        if email is not None:
            rep.add_contact_detail(type='email', value=email)
        rep.add_source(self.url)

        yield rep
Beispiel #8
0
    def _scrape_lower_chamber(self, term):
        """Scrape the House member list and save one legislator per row.

        Each table row supplies the photo URL, a heading of the form
        ``Name (NNX PARTY)`` (district is two digits plus a letter, party is
        the trailing capitals before the closing parenthesis), two address
        lines, a phone number and an e-mail link.  ``term`` is passed through
        to every ``Legislator`` created.

        Phone and e-mail are stored as ``None`` when they fail validation.
        """
        url = 'http://www.house.leg.state.mn.us/members/hmem.asp'

        page = self.lxmlize(url)

        legislator_nodes = self.get_nodes(
            page,
            '//div[@id="hide_show_alpha_all"]/table/tr/td/table/tr')

        for legislator_node in legislator_nodes:
            photo_url = self.get_node(
                legislator_node,
                './td[1]/a/img/@src')

            info_nodes = self.get_nodes(
                legislator_node,
                './td[2]/p/a')

            name_text = self.get_node(
                info_nodes[0],
                './b/text()')

            name_match = re.search(r'^.+\(', name_text)
            name = name_match.group(0)
            name = name.replace('(', '').strip()

            district_match = re.search(r'\([0-9]{2}[A-Z]', name_text)
            district_text = district_match.group(0)
            # Districts are zero-padded on the page ("01A" -> "1A").
            district = district_text.replace('(', '').lstrip('0').strip()

            party_match = re.search(r'[A-Z]+\)$', name_text)
            party_text = party_match.group(0)
            party_text = party_text.replace(')', '').strip()
            party = self._parties[party_text]

            info_texts = self.get_nodes(
                legislator_node,
                './td[2]/p/text()[normalize-space() and preceding-sibling'
                '::br]')
            address = '\n'.join((info_texts[0], info_texts[1]))

            # Rebind on every iteration: otherwise a failed validation would
            # silently reuse the previous legislator's phone or e-mail.
            phone_text = info_texts[2]
            phone = phone_text if validate_phone_number(phone_text) else None

            email_text = info_nodes[1].text
            email_text = email_text.replace('Email: ', '').strip()
            email = email_text if validate_email_address(email_text) else None

            legislator = Legislator(
                term=term,
                chamber='lower',
                district=district,
                full_name=name,
                party=party,
                email=email,
                photo_url=photo_url,
            )
            legislator.add_source(url)

            legislator.add_office(
                type='capitol',
                name="Capitol Office",
                address=address,
                phone=phone,
                email=email,
            )

            self.save_legislator(legislator)
Beispiel #9
0
    def _scrape_lower_chamber(self, term):
        """Scrape the House member list and save one legislator per row.

        Each table row supplies the photo URL, a heading of the form
        ``Name (NNX PARTY)`` (district is two digits plus a letter, party is
        the trailing capitals before the closing parenthesis), two address
        lines, a phone number and an e-mail link.  ``term`` is passed through
        to every ``Legislator`` created.

        Phone and e-mail are stored as ``None`` when they fail validation.
        """
        url = 'http://www.house.leg.state.mn.us/members/hmem.asp'

        page = self.lxmlize(url)

        legislator_nodes = self.get_nodes(
            page,
            '//div[@id="hide_show_alpha_all"]/table/tr/td/table/tr')

        for legislator_node in legislator_nodes:
            photo_url = self.get_node(
                legislator_node,
                './td[1]/a/img/@src')

            info_nodes = self.get_nodes(
                legislator_node,
                './td[2]/p/a')

            name_text = self.get_node(
                info_nodes[0],
                './b/text()')

            name_match = re.search(r'^.+\(', name_text)
            name = name_match.group(0)
            name = name.replace('(', '').strip()

            district_match = re.search(r'\([0-9]{2}[A-Z]', name_text)
            district_text = district_match.group(0)
            # Districts are zero-padded on the page ("01A" -> "1A").
            district = district_text.replace('(', '').lstrip('0').strip()

            party_match = re.search(r'[A-Z]+\)$', name_text)
            party_text = party_match.group(0)
            party_text = party_text.replace(')', '').strip()
            party = self._parties[party_text]

            info_texts = self.get_nodes(
                legislator_node,
                './td[2]/p/text()[normalize-space() and preceding-sibling'
                '::br]')
            address = '\n'.join((info_texts[0], info_texts[1]))

            # Rebind on every iteration: otherwise a failed validation would
            # silently reuse the previous legislator's phone or e-mail.
            phone_text = info_texts[2]
            phone = phone_text if validate_phone_number(phone_text) else None

            email_text = info_nodes[1].text
            email_text = email_text.replace('Email: ', '').strip()
            email = email_text if validate_email_address(email_text) else None

            legislator = Legislator(
                term=term,
                chamber='lower',
                district=district,
                full_name=name,
                party=party,
                email=email,
                photo_url=photo_url,
            )
            legislator.add_source(url)

            legislator.add_office(
                type='capitol',
                name="Capitol Office",
                address=address,
                phone=phone,
                email=email,
            )

            self.save_legislator(legislator)
Beispiel #10
0
    def scrape_lower_chamber(self, term):
        """Scrape the House member list and save one legislator per row.

        Each table row supplies the photo URL, a heading of the form
        ``Name (NNX PARTY)`` (district is two digits plus a letter, party is
        the trailing capitals before the closing parenthesis), two address
        lines, a phone number and an e-mail.  ``term`` is passed through to
        every ``Legislator`` created.

        The e-mail normally sits in the row's second link; when that link is
        missing, the fourth text line is used instead.  A warning is logged
        if no row needed that fallback.  Phone and e-mail are stored as
        ``None`` when they fail validation.
        """
        url = 'http://www.house.leg.state.mn.us/members/hmem.asp'

        page = self.lxmlize(url)

        legislator_nodes = self.get_nodes(
            page,
            '//div[@id="hide_show_alpha_all"]/table/tr/td/table/tr')

        need_special_email_case = False

        for legislator_node in legislator_nodes:
            photo_url = self.get_node(
                legislator_node,
                './td[1]/a/img/@src')

            info_nodes = self.get_nodes(
                legislator_node,
                './td[2]/p/a')

            name_text = self.get_node(
                info_nodes[0],
                './b/text()')

            name_match = re.search(r'^.+\(', name_text)
            name = name_match.group(0)
            name = name.replace('(', '').strip()

            district_match = re.search(r'\([0-9]{2}[A-Z]', name_text)
            district_text = district_match.group(0)
            # Districts are zero-padded on the page ("01A" -> "1A").
            district = district_text.replace('(', '').lstrip('0').strip()

            party_match = re.search(r'[A-Z]+\)$', name_text)
            party_text = party_match.group(0)
            party_text = party_text.replace(')', '').strip()
            party = self._parties[party_text]

            info_texts = self.get_nodes(
                legislator_node,
                './td[2]/p/text()[normalize-space() and preceding-sibling'
                '::br]')
            address = '\n'.join((info_texts[0], info_texts[1]))

            # Rebind on every iteration: otherwise a failed validation would
            # silently reuse the previous legislator's phone or e-mail.
            phone_text = info_texts[2]
            if self._validate_phone_number(phone_text):
                phone = phone_text
            else:
                phone = None

            # E-mail markup is inconsistent across rows.
            try:
                email_text = info_nodes[1].text
            except IndexError:
                # Primarily for Dan Fabian: the address is plain text, so
                # the fallback must feed email_text (not a node variable).
                email_text = info_texts[3]
                need_special_email_case = True

            email_text = email_text.replace('Email: ', '').strip()
            email = email_text if validate_email_address(email_text) else None

            legislator = Legislator(
                term=term,
                chamber='lower',
                district=district,
                full_name=name,
                party=party,
                email=email,
                photo_url=photo_url,
            )
            legislator.add_source(url)

            legislator.add_office(
                type='capitol',
                name="Capitol Office",
                address=address,
                phone=phone,
                email=email,
            )

            self.save_legislator(legislator)

        if not need_special_email_case:
            self.logger.warning('Special e-mail handling no longer required.')
    def _scrape_lower_chamber(self, term):
        """Scrape the House member list and save one legislator per row.

        Each table row supplies the photo URL, a heading of the form
        ``Name (NNX PARTY)`` (district is two digits plus a letter, party is
        the trailing capitals before the closing parenthesis), two address
        lines, a phone number and an e-mail.  ``term`` is passed through to
        every ``Legislator`` created.

        The e-mail normally sits in the row's second link; when that link is
        missing, the fourth text line is used instead.  A warning is logged
        if no row needed that fallback.  Phone and e-mail are stored as
        ``None`` when they fail validation.
        """
        url = 'http://www.house.leg.state.mn.us/members/hmem.asp'

        page = self.lxmlize(url)

        legislator_nodes = self.get_nodes(
            page, '//div[@id="hide_show_alpha_all"]/table/tr/td/table/tr')

        need_special_email_case = False

        for legislator_node in legislator_nodes:
            photo_url = self.get_node(legislator_node, './td[1]/a/img/@src')

            info_nodes = self.get_nodes(legislator_node, './td[2]/p/a')

            name_text = self.get_node(info_nodes[0], './b/text()')

            name_match = re.search(r'^.+\(', name_text)
            name = name_match.group(0)
            name = name.replace('(', '').strip()

            district_match = re.search(r'\([0-9]{2}[A-Z]', name_text)
            district_text = district_match.group(0)
            # Districts are zero-padded on the page ("01A" -> "1A").
            district = district_text.replace('(', '').lstrip('0').strip()

            party_match = re.search(r'[A-Z]+\)$', name_text)
            party_text = party_match.group(0)
            party_text = party_text.replace(')', '').strip()
            party = self._parties[party_text]

            info_texts = self.get_nodes(
                legislator_node,
                './td[2]/p/text()[normalize-space() and preceding-sibling'
                '::br]')
            address = '\n'.join((info_texts[0], info_texts[1]))

            # Rebind on every iteration: otherwise a failed validation would
            # silently reuse the previous legislator's phone or e-mail.
            phone_text = info_texts[2]
            phone = phone_text if validate_phone_number(phone_text) else None

            # E-mail markup is inconsistent across rows.
            try:
                email_text = info_nodes[1].text
            except IndexError:
                # Primarily for Dan Fabian: the address is plain text, so
                # the fallback must feed email_text (not a node variable).
                email_text = info_texts[3]
                need_special_email_case = True

            email_text = email_text.replace('Email: ', '').strip()
            email = email_text if validate_email_address(email_text) else None

            legislator = Legislator(
                term=term,
                chamber='lower',
                district=district,
                full_name=name,
                party=party,
                email=email,
                photo_url=photo_url,
            )
            legislator.add_source(url)

            legislator.add_office(
                type='capitol',
                name="Capitol Office",
                address=address,
                phone=phone,
                email=email,
            )

            self.save_legislator(legislator)

        if not need_special_email_case:
            self.logger.warning('Special e-mail handling no longer required.')