def handle_list_item(self, item):
    """Yield a ``Person`` built from one representative's list entry.

    The entry's heading text must have the form ``Name (NNx, PARTY)`` where
    ``NN`` is two digits, ``x`` is ``A`` or ``B`` and ``PARTY`` is an
    upper-case abbreviation that is a key of ``PARTIES``.

    Args:
        item: element for one list entry; queried with ``item.xpath``.

    Yields:
        A single ``Person`` with link, source, capitol address and, when
        they pass validation, capitol phone and e-mail contact details.

    Raises:
        IndexError: if an expected image, link or text node is missing.
        AttributeError: if the heading text does not match the pattern.
        KeyError: if the party abbreviation is not in ``PARTIES``.
    """
    photo_url = item.xpath('./img/@src')[0]
    url = item.xpath('.//h5/a/@href')[0]
    name_text = item.xpath('.//h5/a/b/text()')[0]

    name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
    name = name_match.group(1).strip()
    # Drop the zero padding so "01A" becomes "1A".
    district = name_match.group(2).lstrip('0').upper()
    party_text = name_match.group(3)
    party = PARTIES[party_text]

    info_texts = [
        x.strip()
        for x in item.xpath('./div/text()[normalize-space()]')
        if x.strip()
    ]
    # The first two non-blank text nodes form the address, the third the phone.
    address = '\n'.join((info_texts[0], info_texts[1]))

    # Default to None so that a value failing validation is skipped below
    # instead of leaving the name unbound (UnboundLocalError).
    phone_text = info_texts[2]
    phone = phone_text if validate_phone_number(phone_text) else None

    # The second link in the entry is the mailto: link.
    email_text = item.xpath('.//a/@href')[1].replace('mailto:', '').strip()
    email = email_text if validate_email_address(email_text) else None

    rep = Person(name=name, district=district, party=party,
                 primary_org='lower', role='Representative',
                 image=photo_url)
    rep.add_link(url)
    rep.add_contact_detail(type='address', value=address, note='capitol')
    if phone is not None:
        rep.add_contact_detail(type='voice', value=phone, note='capitol')
    if email is not None:
        rep.add_contact_detail(type='email', value=email, note='capitol')
    rep.add_source(self.url)

    yield rep
def _extract_email(self, doc):
    """Return the e-mail address assembled from the page's inline script.

    The address is stored as quoted string fragments on the script line
    that contains ``+ "@" +``; the fragments are joined in order.

    Args:
        doc: parsed page; queried with ``doc.xpath``.

    Returns:
        The joined address if ``validate_email_address`` accepts it,
        otherwise ``None``. ``None`` is also returned when the script node
        or the line holding the address cannot be found.
    """
    xpath = (
        '//div[@class="districtheadleft"]'
        '/b[contains(text(), "Email:")]'
        '/../following-sibling::div'
        '/script/text()'
    )
    scripts = doc.xpath(xpath)
    if not scripts:
        return None

    # next() is used instead of filter(...)[0]: on Python 3 filter() returns
    # an iterator that cannot be indexed. The default also covers the case
    # where no line matches.
    line = next(
        (ln for ln in scripts[0].split('\r\n') if '+ "@" +' in ln),
        None,
    )
    if line is None:
        return None

    email = ''.join(re.findall(r'"(.+?)"', line))
    return email if validate_email_address(email) else None
def _extract_email(self, doc):
    """Return the e-mail address assembled from the page's inline script.

    The script line containing ``+ "@" +`` holds the address as quoted
    string fragments; they are concatenated in order of appearance.

    Args:
        doc: parsed page; queried with ``doc.xpath``.

    Returns:
        The concatenated address when ``validate_email_address`` accepts
        it, otherwise ``None``. Also ``None`` when the script node or the
        address line is absent.
    """
    xpath = (
        '//div[@class="districtheadleft"]'
        '/b[contains(text(), "Email:")]'
        '/../following-sibling::div'
        '/script/text()'
    )
    matches = doc.xpath(xpath)
    if not matches:
        return None
    script = matches[0]

    # filter() is lazy on Python 3 and cannot be subscripted, so take the
    # first matching line with next() and fall back to None if none match.
    line = next(
        (candidate for candidate in script.split('\r\n')
         if '+ "@" +' in candidate),
        None,
    )
    if line is None:
        return None

    parts = re.findall(r'"(.+?)"', line)
    email = ''.join(parts)

    return email if validate_email_address(email) else None
def _get_rep_email(self, district):
    """Look up a representative's e-mail on the House contact page.

    The address is read from the ``value`` of the element whose id is
    ``txtMemberEmail`` on the contact page for ``district``.

    Args:
        district: district identifier appended to the contact URL.

    Returns:
        The address when it passes ``validate_email_address``; ``None``
        when the element is missing or the address is rejected. A falsy
        ``value`` on the element is returned unchanged.
    """
    contact_url = (
        "https://www.okhouse.gov/Members/Contact.aspx?District=" + district
    )
    contact_page = self.curl_lxmlize(contact_url)

    try:
        address = contact_page.get_element_by_id("txtMemberEmail").value
    except KeyError:
        # No e-mail input on the page.
        return None

    if not address:
        # Nothing to validate; hand back the empty value as-is.
        return address

    return address if validate_email_address(address) else None
def handle_list_item(self, item):
    """Yield a ``Person`` built from one representative's table row.

    The first link in the row's second cell carries the profile URL and a
    bold heading of the form ``Name (NNx, PARTY)``; the second link carries
    the e-mail text prefixed with ``Email: ``.

    Args:
        item: table-row element; queried with ``item.xpath``.

    Yields:
        A single ``Person`` with link, source, capitol address and, when
        they pass validation, capitol phone and e-mail contact details.

    Raises:
        IndexError: if an expected image, link or text node is missing.
        AttributeError: if the heading text does not match the pattern.
        KeyError: if the party abbreviation is not in ``PARTIES``.
    """
    photo_url = item.xpath('./td[1]/a/img/@src')[0]
    info_nodes = item.xpath('./td[2]/p/a')
    name_text = info_nodes[0].xpath('./b/text()')[0]
    url = info_nodes[0].get('href')

    name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
    name = name_match.group(1).strip()
    # Drop the zero padding so "01A" becomes "1A".
    district = name_match.group(2).lstrip('0').upper()
    party_text = name_match.group(3)
    party = PARTIES[party_text]

    info_texts = [
        x.strip()
        for x in item.xpath(
            './td[2]/p/text()[normalize-space() and preceding-sibling::br]'
        )
        if x.strip()
    ]
    # The first two text nodes form the address, the third the phone.
    address = '\n'.join((info_texts[0], info_texts[1]))

    # Default to None so that a value failing validation is skipped below
    # instead of leaving the name unbound (UnboundLocalError).
    phone_text = info_texts[2]
    phone = phone_text if validate_phone_number(phone_text) else None

    email_text = info_nodes[1].text
    email_text = email_text.replace('Email: ', '').strip()
    email = email_text if validate_email_address(email_text) else None

    rep = Person(name=name, district=district, party=party,
                 primary_org='lower', role='Representative',
                 image=photo_url)
    rep.add_link(url)
    rep.add_contact_detail(type='address', value=address, note='capitol')
    if phone is not None:
        rep.add_contact_detail(type='voice', value=phone, note='capitol')
    if email is not None:
        rep.add_contact_detail(type='email', value=email, note='capitol')
    rep.add_source(self.url)

    yield rep
def handle_list_item(self, item):
    """Yield a ``Person`` built from one representative's list entry.

    The entry's heading text must have the form ``Name (NNx, PARTY)`` where
    ``NN`` is two digits, ``x`` is ``A`` or ``B`` and ``PARTY`` is an
    upper-case abbreviation that is a key of ``PARTIES``.

    Args:
        item: element for one list entry; queried with ``item.xpath``.

    Yields:
        A single ``Person`` with link, source, capitol address and, when
        they pass validation, capitol phone and e-mail contact details.

    Raises:
        IndexError: if an expected image, link or text node is missing.
        AttributeError: if the heading text does not match the pattern.
        KeyError: if the party abbreviation is not in ``PARTIES``.
    """
    photo_url = item.xpath("./img/@src")[0]
    url = item.xpath(".//h5/a/@href")[0]
    name_text = item.xpath(".//h5/a/b/text()")[0]

    name_match = re.match(r"^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$", name_text)
    name = name_match.group(1).strip()
    # Drop the zero padding so "01A" becomes "1A".
    district = name_match.group(2).lstrip("0").upper()
    party_text = name_match.group(3)
    party = PARTIES[party_text]

    info_texts = [
        x.strip()
        for x in item.xpath("./div/text()[normalize-space()]")
        if x.strip()
    ]
    # The first two non-blank text nodes form the address, the third the phone.
    address = "\n".join((info_texts[0], info_texts[1]))

    # Default to None so that a value failing validation is skipped below
    # instead of leaving the name unbound (UnboundLocalError).
    phone_text = info_texts[2]
    phone = phone_text if validate_phone_number(phone_text) else None

    # The second link in the entry is the mailto: link.
    email_text = item.xpath(".//a/@href")[1].replace("mailto:", "").strip()
    email = email_text if validate_email_address(email_text) else None

    rep = Person(
        name=name,
        district=district,
        party=party,
        primary_org="lower",
        role="Representative",
        image=photo_url,
    )
    rep.add_link(url)
    rep.add_contact_detail(type="address", value=address, note="capitol")
    if phone is not None:
        rep.add_contact_detail(type="voice", value=phone, note="capitol")
    if email is not None:
        rep.add_contact_detail(type="email", value=email, note="capitol")
    rep.add_source(self.url)

    yield rep
def handle_list_item(self, item):
    """Yield a ``Person`` built from one representative's table row.

    The first link in the row's second cell carries the profile URL and a
    bold heading of the form ``Name (NNx, PARTY)``; the second link carries
    the e-mail text prefixed with ``Email: ``.

    Args:
        item: table-row element; queried with ``item.xpath``.

    Yields:
        A single ``Person`` with link, source, address and, when they pass
        validation, phone and e-mail contact details.

    Raises:
        IndexError: if an expected image, link or text node is missing.
        AttributeError: if the heading text does not match the pattern.
        KeyError: if the party abbreviation is not in ``PARTIES``.
    """
    photo_url = item.xpath('./td[1]/a/img/@src')[0]
    info_nodes = item.xpath('./td[2]/p/a')
    name_text = info_nodes[0].xpath('./b/text()')[0]
    url = info_nodes[0].get('href')

    name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
    name = name_match.group(1).strip()
    # Drop the zero padding so "01A" becomes "1A".
    district = name_match.group(2).lstrip('0').upper()
    party_text = name_match.group(3)
    party = PARTIES[party_text]

    info_texts = [
        x.strip()
        for x in item.xpath(
            './td[2]/p/text()[normalize-space() and preceding-sibling::br]'
        )
        if x.strip()
    ]
    # The first two text nodes form the address, the third the phone.
    address = '\n'.join((info_texts[0], info_texts[1]))

    # Default to None so that a value failing validation is skipped below
    # instead of leaving the name unbound (UnboundLocalError).
    phone_text = info_texts[2]
    phone = phone_text if validate_phone_number(phone_text) else None

    email_text = info_nodes[1].text
    email_text = email_text.replace('Email: ', '').strip()
    email = email_text if validate_email_address(email_text) else None

    rep = Person(name=name, district=district, party=party,
                 primary_org='lower', role='Representative',
                 image=photo_url)
    rep.add_link(url)
    rep.add_contact_detail(type='address', value=address)
    if phone is not None:
        rep.add_contact_detail(type='voice', value=phone)
    if email is not None:
        rep.add_contact_detail(type='email', value=email)
    rep.add_source(self.url)

    yield rep
def _scrape_lower_chamber(self, term):
    """Scrape the House member list and save one ``Legislator`` per row.

    Each row's heading text is expected to look like ``Name (NNx, PARTY)``;
    the name, district and party are pulled from it with separate regular
    expressions. Phone and e-mail are stored only when they pass
    validation; otherwise ``None`` is passed for them.

    Args:
        term: term value passed through to ``Legislator``.

    Raises:
        IndexError: if a row lacks an expected link or text node.
        AttributeError: if the heading text does not match a pattern.
        KeyError: if the party abbreviation is not in ``self._parties``.
    """
    url = 'http://www.house.leg.state.mn.us/members/hmem.asp'
    page = self.lxmlize(url)

    legislator_nodes = self.get_nodes(
        page,
        '//div[@id="hide_show_alpha_all"]/table/tr/td/table/tr')

    for legislator_node in legislator_nodes:
        # Reset per row: without this a value that fails validation would
        # either be unbound (first row) or silently reuse the previous
        # legislator's phone/e-mail.
        phone = None
        email = None

        photo_url = self.get_node(
            legislator_node,
            './td[1]/a/img/@src')

        info_nodes = self.get_nodes(
            legislator_node,
            './td[2]/p/a')

        name_text = self.get_node(
            info_nodes[0],
            './b/text()')

        name_match = re.search(r'^.+\(', name_text)
        name = name_match.group(0)
        name = name.replace('(', '').strip()

        district_match = re.search(r'\([0-9]{2}[A-Z]', name_text)
        district_text = district_match.group(0)
        # Drop the parenthesis and zero padding: "(01A" becomes "1A".
        district = district_text.replace('(', '').lstrip('0').strip()

        party_match = re.search(r'[A-Z]+\)$', name_text)
        party_text = party_match.group(0)
        party_text = party_text.replace(')', '').strip()
        party = self._parties[party_text]

        info_texts = self.get_nodes(
            legislator_node,
            './td[2]/p/text()[normalize-space() and preceding-sibling'
            '::br]')
        # The first two text nodes form the address, the third the phone.
        address = '\n'.join((info_texts[0], info_texts[1]))

        phone_text = info_texts[2]
        if validate_phone_number(phone_text):
            phone = phone_text

        email_node = info_nodes[1]
        email_text = email_node.text
        email_text = email_text.replace('Email: ', '').strip()
        if validate_email_address(email_text):
            email = email_text

        legislator = Legislator(
            term=term,
            chamber='lower',
            district=district,
            full_name=name,
            party=party,
            email=email,
            photo_url=photo_url,
        )
        legislator.add_source(url)
        legislator.add_office(
            type='capitol',
            name="Capitol Office",
            address=address,
            phone=phone,
            email=email,
        )

        self.save_legislator(legislator)
def scrape_lower_chamber(self, term):
    """Scrape the House member list and save one ``Legislator`` per row.

    Each row's heading text is expected to look like ``Name (NNx, PARTY)``;
    the name, district and party are pulled from it with separate regular
    expressions. The e-mail normally comes from the row's second link; when
    that link is absent the fourth text node is used instead. Phone and
    e-mail are stored only when they pass validation; otherwise ``None`` is
    passed for them.

    After the loop a warning is logged if no row needed the e-mail
    fallback.

    Args:
        term: term value passed through to ``Legislator``.

    Raises:
        IndexError: if a row lacks the name link or address/phone text.
        AttributeError: if the heading text does not match a pattern.
        KeyError: if the party abbreviation is not in ``self._parties``.
    """
    url = 'http://www.house.leg.state.mn.us/members/hmem.asp'
    page = self.lxmlize(url)

    legislator_nodes = self.get_nodes(
        page,
        '//div[@id="hide_show_alpha_all"]/table/tr/td/table/tr')

    need_special_email_case = False

    for legislator_node in legislator_nodes:
        # Reset per row: without this a value that fails validation would
        # either be unbound (first row) or silently reuse the previous
        # legislator's phone/e-mail.
        phone = None
        email = None

        photo_url = self.get_node(
            legislator_node,
            './td[1]/a/img/@src')

        info_nodes = self.get_nodes(
            legislator_node,
            './td[2]/p/a')

        name_text = self.get_node(
            info_nodes[0],
            './b/text()')

        name_match = re.search(r'^.+\(', name_text)
        name = name_match.group(0)
        name = name.replace('(', '').strip()

        district_match = re.search(r'\([0-9]{2}[A-Z]', name_text)
        district_text = district_match.group(0)
        # Drop the parenthesis and zero padding: "(01A" becomes "1A".
        district = district_text.replace('(', '').lstrip('0').strip()

        party_match = re.search(r'[A-Z]+\)$', name_text)
        party_text = party_match.group(0)
        party_text = party_text.replace(')', '').strip()
        party = self._parties[party_text]

        info_texts = self.get_nodes(
            legislator_node,
            './td[2]/p/text()[normalize-space() and preceding-sibling'
            '::br]')
        # The first two text nodes form the address, the third the phone.
        address = '\n'.join((info_texts[0], info_texts[1]))

        phone_text = info_texts[2]
        if self._validate_phone_number(phone_text):
            phone = phone_text

        # E-mail markup is inconsistent between rows.
        try:
            email_text = info_nodes[1].text
        except IndexError:
            # Primarily for Dan Fabian: no e-mail link, so the address is
            # taken from the text nodes. It must land in email_text (not
            # email_node) or the fallback value is never used.
            email_text = info_texts[3] if len(info_texts) > 3 else ''
            need_special_email_case = True

        # A link without text has .text of None; treat that as "no e-mail".
        email_text = (email_text or '').replace('Email: ', '').strip()
        if email_text and validate_email_address(email_text):
            email = email_text

        legislator = Legislator(
            term=term,
            chamber='lower',
            district=district,
            full_name=name,
            party=party,
            email=email,
            photo_url=photo_url,
        )
        legislator.add_source(url)
        legislator.add_office(
            type='capitol',
            name="Capitol Office",
            address=address,
            phone=phone,
            email=email,
        )

        self.save_legislator(legislator)

    if not need_special_email_case:
        self.logger.warning('Special e-mail handling no longer required.')
def _scrape_lower_chamber(self, term):
    """Scrape the House member list and save one ``Legislator`` per row.

    Each row's heading text is expected to look like ``Name (NNx, PARTY)``;
    the name, district and party are pulled from it with separate regular
    expressions. The e-mail normally comes from the row's second link; when
    that link is absent the fourth text node is used instead. Phone and
    e-mail are stored only when they pass validation; otherwise ``None`` is
    passed for them.

    After the loop a warning is logged if no row needed the e-mail
    fallback.

    Args:
        term: term value passed through to ``Legislator``.

    Raises:
        IndexError: if a row lacks the name link or address/phone text.
        AttributeError: if the heading text does not match a pattern.
        KeyError: if the party abbreviation is not in ``self._parties``.
    """
    url = 'http://www.house.leg.state.mn.us/members/hmem.asp'
    page = self.lxmlize(url)

    legislator_nodes = self.get_nodes(
        page,
        '//div[@id="hide_show_alpha_all"]/table/tr/td/table/tr')

    need_special_email_case = False

    for legislator_node in legislator_nodes:
        # Reset per row: without this a value that fails validation would
        # either be unbound (first row) or silently reuse the previous
        # legislator's phone/e-mail.
        phone = None
        email = None

        photo_url = self.get_node(legislator_node, './td[1]/a/img/@src')
        info_nodes = self.get_nodes(legislator_node, './td[2]/p/a')
        name_text = self.get_node(info_nodes[0], './b/text()')

        name_match = re.search(r'^.+\(', name_text)
        name = name_match.group(0)
        name = name.replace('(', '').strip()

        district_match = re.search(r'\([0-9]{2}[A-Z]', name_text)
        district_text = district_match.group(0)
        # Drop the parenthesis and zero padding: "(01A" becomes "1A".
        district = district_text.replace('(', '').lstrip('0').strip()

        party_match = re.search(r'[A-Z]+\)$', name_text)
        party_text = party_match.group(0)
        party_text = party_text.replace(')', '').strip()
        party = self._parties[party_text]

        info_texts = self.get_nodes(
            legislator_node,
            './td[2]/p/text()[normalize-space() and preceding-sibling'
            '::br]')
        # The first two text nodes form the address, the third the phone.
        address = '\n'.join((info_texts[0], info_texts[1]))

        phone_text = info_texts[2]
        if validate_phone_number(phone_text):
            phone = phone_text

        # E-mail markup is inconsistent between rows.
        try:
            email_text = info_nodes[1].text
        except IndexError:
            # Primarily for Dan Fabian: no e-mail link, so the address is
            # taken from the text nodes. It must land in email_text (not
            # email_node) or the fallback value is never used.
            email_text = info_texts[3] if len(info_texts) > 3 else ''
            need_special_email_case = True

        # A link without text has .text of None; treat that as "no e-mail".
        email_text = (email_text or '').replace('Email: ', '').strip()
        if email_text and validate_email_address(email_text):
            email = email_text

        legislator = Legislator(
            term=term,
            chamber='lower',
            district=district,
            full_name=name,
            party=party,
            email=email,
            photo_url=photo_url,
        )
        legislator.add_source(url)
        legislator.add_office(
            type='capitol',
            name="Capitol Office",
            address=address,
            phone=phone,
            email=email,
        )

        self.save_legislator(legislator)

    if not need_special_email_case:
        self.logger.warning('Special e-mail handling no longer required.')