Beispiel #1
0
    def _scrape_upper_chamber(self, term):
        """Scrape the Minnesota Senate roster and save one Legislator per row.

        Two sources are combined:

        * the HTML member index, which supplies each senator's office phone,
          profile URL, photo URL and (when present) e-mail address;
        * the CSV member list, which supplies name, party, district and
          mailing address.

        Records from the two sources are joined on the "First Last" name.

        :param term: term identifier passed through to each ``Legislator``.
        """
        index_url = 'http://www.senate.mn/members/index.php'
        doc = lxml.html.fromstring(self.get(index_url).text)
        doc.make_links_absolute(index_url)

        leg_data = defaultdict(dict)

        # get all the tds in a certain div
        tds = doc.xpath('//div[@id="hide_show_alpha_all"]//td[@style="vertical-align:top;"]')
        for td in tds:
            # each td has 2 <a>s- site & email
            main_link, email = td.xpath('.//a')
            # get name
            name = main_link.text_content().split(' (')[0]
            leg = leg_data[name]
            # A list comprehension (rather than indexing the result of
            # filter()) keeps this working where filter() returns an iterator.
            phones = [text for text in td.xpath('.//p/text()')
                      if re.match(r'\d{3}-\d{3}-\d{4}', text)]
            leg['office_phone'] = phones[0].strip()
            leg['url'] = main_link.get('href')
            leg['photo_url'] = td.xpath('./preceding-sibling::td//img/@src')[0]
            if 'mailto:' in email.get('href'):
                leg['email'] = email.get('href').replace('mailto:', '')

        self.info('collected preliminary data on %s legislators', len(leg_data))
        assert leg_data

        # use CSV for most of data
        csv_url = 'http://www.senate.mn/members/member_list_ascii.php?ls='
        csvfile = self.get(csv_url).text

        # Substrings of the second address line that identify a capitol
        # complex building; anything else is treated as a district office.
        capitol_markers = ('95 University Avenue W',
                           '100 Rev. Dr. Martin Luther King')

        for row in csv.DictReader(StringIO(csvfile)):
            if not row['First Name']:
                continue
            name = '%s %s' % (row['First Name'], row['Last Name'])
            party = self._parties[row['Party']]
            # leg_data is a defaultdict, so a senator missing from the index
            # page simply gets an empty set of extra attributes.
            extra = leg_data[name]
            # The e-mail belongs to the office record, not to the legislator.
            email = extra.pop('email', None)
            leg = Legislator(term, 'upper', row['District'].lstrip('0'), name,
                             party=party,
                             first_name=row['First Name'],
                             last_name=row['Last Name'],
                             **extra
                            )
            row["Zipcode"] = row["Zipcode"].strip()

            # Accommodate for multiple address column naming conventions.
            # Fall back to '' (not False) so substring tests and str.format
            # below never see a non-string value.
            address1_fields = [row.get('Address'), row.get('Office Building')]
            address2_fields = [row.get('Address2'), row.get('Office Address')]
            row['Address'] = next(
                (a for a in address1_fields if a is not None), '')
            row['Address2'] = next(
                (a for a in address2_fields if a is not None), '')

            # any() is required here: a bare generator expression is always
            # truthy, which would classify every office as a capitol office.
            if any(marker in row['Address2'] for marker in capitol_markers):
                address = '{Address}\n{Address2}\n{City}, {State} {Zipcode}'.format(**row)
                # Prefix the room number only when the CSV actually has one.
                if row.get('Rm. Number'):
                    address = '{0} {1}'.format(row['Rm. Number'], address)
                leg.add_office('capitol', 'Capitol Office',
                    address=address,
                    email=email, phone=leg.get('office_phone'))
            elif row['Address2']:
                leg.add_office('district', 'District Office',
                    address='{Address}\n{Address2}\n{City}, {State} {Zipcode}'.format(**row),
                    email=email)
            else:
                leg.add_office('district', 'District Office',
                    address='{Address}\n{City}, {State} {Zipcode}'.format(**row),
                    email=email)

            leg.add_source(csv_url)
            leg.add_source(index_url)

            self.save_legislator(leg)
Beispiel #2
0
    def _scrape_upper_chamber(self, term):
        """Scrape the Minnesota Senate roster and save one Legislator per row.

        The HTML member index provides office phone, profile URL, photo URL
        and e-mail; the CSV member list provides name, party, district and
        mailing address. The two are joined on the "First Last" name.

        :param term: term identifier passed through to each ``Legislator``.
        """
        index_url = 'http://www.senate.mn/members/index.php'
        doc = lxml.html.fromstring(self.get(index_url).text)
        doc.make_links_absolute(index_url)

        leg_data = defaultdict(dict)

        # get all the tds in a certain div
        tds = doc.xpath('//div[@id="hide_show_alpha_all"]//td[@style="vertical-align:top;"]')
        for td in tds:
            # each td has 2 <a>s- site & email
            main_link, email = td.xpath('.//a')
            # get name
            name = main_link.text_content().split(' (')[0]
            leg = leg_data[name]
            # Build a list instead of indexing filter(): filter() is not
            # subscriptable where it returns an iterator.
            phone_texts = [s for s in td.xpath('.//p/text()')
                           if re.match(r'\d{3}-\d{3}-\d{4}', s)]
            leg['office_phone'] = phone_texts[0].strip()
            leg['url'] = main_link.get('href')
            leg['photo_url'] = td.xpath('./preceding-sibling::td//img/@src')[0]
            if 'mailto:' in email.get('href'):
                leg['email'] = email.get('href').replace('mailto:', '')

        self.info('collected preliminary data on %s legislators', len(leg_data))
        assert leg_data

        # use CSV for most of data
        csv_url = 'http://www.senate.mn/members/member_list_ascii.php?ls='
        csvfile = self.get(csv_url).text

        # Second-address-line substrings that mark a capitol complex office.
        capitol_addresses = ('95 University Avenue W',
                             '100 Rev. Dr. Martin Luther King')

        for row in csv.DictReader(StringIO(csvfile)):
            if not row['First Name']:
                continue
            name = '%s %s' % (row['First Name'], row['Last Name'])
            party = self._parties[row['Party']]
            # defaultdict lookup: senators absent from the index page get an
            # empty dict of extra attributes.
            scraped = leg_data[name]
            # E-mail is attached to the office below, not to the Legislator.
            email = scraped.pop('email', None)
            leg = Legislator(term, 'upper', row['District'].lstrip('0'), name,
                             party=party,
                             first_name=row['First Name'],
                             last_name=row['Last Name'],
                             **scraped
                            )
            row["Zipcode"] = row["Zipcode"].strip()

            # Accommodate for multiple address column naming conventions.
            # Default to '' rather than False so the values stay strings for
            # the membership test and for str.format.
            address1_fields = [row.get('Address'), row.get('Office Building')]
            address2_fields = [row.get('Address2'), row.get('Office Address')]
            row['Address'] = next(
                (a for a in address1_fields if a is not None), '')
            row['Address2'] = next(
                (a for a in address2_fields if a is not None), '')

            # Wrap the generator in any(): on its own a generator object is
            # always truthy, so the district branches could never run.
            if any(a in row['Address2'] for a in capitol_addresses):
                address = '{Address}\n{Address2}\n{City}, {State} {Zipcode}'.format(**row)
                # The room column is optional; only prefix it when present
                # instead of raising KeyError on CSVs that lack it.
                room = row.get('Rm. Number')
                if room:
                    address = '{0} {1}'.format(room, address)
                leg.add_office('capitol', 'Capitol Office',
                    address=address,
                    email=email, phone=leg.get('office_phone'))
            elif row['Address2']:
                leg.add_office('district', 'District Office',
                    address='{Address}\n{Address2}\n{City}, {State} {Zipcode}'.format(**row),
                    email=email)
            else:
                leg.add_office('district', 'District Office',
                    address='{Address}\n{City}, {State} {Zipcode}'.format(**row),
                    email=email)

            leg.add_source(csv_url)
            leg.add_source(index_url)

            self.save_legislator(leg)