def scrape(self):
    """Yield an Event for each row of the legistar events master table."""
    for page in self.eventPages(EVENTSPAGE):
        table = page.xpath("//table[@class='rgMasterTable']")[0]
        for events, headers, rows in self.parseDataTable(table):
            print(events)
            # "Meeting Location" looks like
            # "<room>--<address>--Chicago, Illinois<status marker>"
            raw_location = events[u'Meeting\xa0Location']
            parts = raw_location.split('--')
            location = ', '.join(parts[0:2])
            # Whatever trails "Chicago, Illinois" is a status marker.
            trailer = parts[-1].split('Chicago, Illinois')
            status = 'confirmed'
            if len(trailer) > 1 and trailer[1]:
                status = trailer[1].lower()
                if status not in ['cancelled', 'tentative',
                                  'confirmed', 'passed']:
                    print(status)
                    status = 'confirmed'
            when = events[u'Meeting\xa0Date']
            time_string = events[u'Meeting\xa0Time']
            event_time = datetime.datetime.strptime(time_string, "%I:%M %p")
            # NOTE(review): only the hour is copied onto the date; the
            # minutes from "Meeting Time" are dropped — confirm intended.
            when = when.replace(hour=event_time.hour)
            e = Event(name=events["Name"]["label"],
                      when=when,
                      location=location,
                      status=status)
            e.add_source(EVENTSPAGE)
            if events['Video'] != u'Not\xa0available':
                print(events['Video'])
            yield e
def scrape_meetings(self, meetings, group):
    """
    Scrape and save event data from a list of meetings.

    Arguments:
    meetings -- A list of lxml elements containing event information
    group -- The type of meeting. The legislature site applies
             different formatting to events based on which group
             they correspond to.  `group` should be one of the
             following strings: 'house', 'senate', or 'commission'.
    """
    for meeting in meetings:
        when = self.get_date(meeting)
        description = self.get_description(meeting)
        location = self.get_location(meeting)
        # Only emit events that have all three required fields.
        if when and description and location:
            event = Event(name=description,
                          start_date=when.replace(tzinfo=self.tz),
                          description=description,
                          location_name=location)
            agenda = self.get_agenda(meeting)
            if agenda:
                event.add_agenda_item(agenda)
            # NOTE(review): `url` is not defined anywhere in this function;
            # unless it is a module-level name this raises NameError —
            # verify and pass the page URL explicitly.
            event.add_source(url)
            yield event
def scrape(self):
    # Walk every page of the legistar events listing and yield an Event
    # per row of the master table.
    for page in self.eventPages(EVENTSPAGE):
        events_table = page.xpath("//table[@class='rgMasterTable']")[0]
        for events, headers, rows in self.parseDataTable(events_table):
            print(events)  # debug output left in by the author
            # "Meeting Location" is
            # "<room>--<address>--Chicago, Illinois<status marker>".
            location_string = events[u'Meeting\xa0Location']
            location_list = location_string.split('--')
            location = ', '.join(location_list[0:2])
            # Anything trailing "Chicago, Illinois" is a status marker.
            status_string = location_list[-1].split('Chicago, Illinois')
            if len(status_string) > 1 and status_string[1]:
                status = status_string[1].lower()
                if status not in [
                    'cancelled', 'tentative', 'confirmed', 'passed'
                ]:
                    print(status)
                    status = 'confirmed'
            else:
                status = 'confirmed'
            when = events[u'Meeting\xa0Date']
            time_string = events[u'Meeting\xa0Time']
            event_time = datetime.datetime.strptime(
                time_string, "%I:%M %p")
            # NOTE(review): only the hour is copied onto the date; the
            # minutes from "Meeting Time" are dropped — confirm intended.
            when = when.replace(hour=event_time.hour)
            e = Event(name=events["Name"]["label"],
                      when=when,
                      location=location,
                      status=status)
            e.add_source(EVENTSPAGE)
            if events['Video'] != u'Not\xa0available':
                print(events['Video'])
            yield e
def scrape(self):
    """Yield an Event for each interim-meeting row on the ALISON site."""
    EVENTS_URL = 'http://www.legislature.state.al.us/aliswww/ISD/InterimMeetings.aspx'
    rows = self.lxmlize(EVENTS_URL).xpath(
        '//table[@id="ContentPlaceHolder1_gvInterimMeeting"]/tr')
    # First row is the table header.
    for row in rows[1:]:
        cells = [td.text_content().strip() for td in row.xpath('td')]
        date_with_time = '{} {}'.format(cells[0], cells[1])
        location = cells[2]
        # 11 South Union Street, Montgomery, Alabama, United States
        # TODO: IF location is "room (X)" add state house
        # TODO: REplace "state house" with address
        # 32°22′37.294″N 86°17′57.991″W
        # host = row.xpath('td')[3].text_content().strip()
        name = cells[3]
        details = cells[4]
        start = self._TZ.localize(
            datetime.datetime.strptime(date_with_time,
                                       self._DATETIME_FORMAT))
        event = Event(start_date=start,
                      name=name,
                      location_name=location,
                      description=details)
        event.add_source(EVENTS_URL)
        yield event
def scrape_committee_events(self, code, name):
    """Yield Events from the CGA fullcalendar JSON feed for one committee."""
    events_url = (
        'http://www.cga.ct.gov/basin/fullcalendar/commevents.php?'
        'comm_code={}'.format(code)
    )
    DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
    for info in json.loads(self.get(events_url).text):
        title = info['title']
        if title is None:
            self.warning("Event found with no title; it will be skipped")
            continue
        if title.startswith('CANCELLED:'):
            self.info(
                "Cancelled event found; it will be skipped: {}".format(
                    title))
            continue
        when = datetime.datetime.strptime(info['start'], DATETIME_FORMAT)
        # end = datetime.datetime.strptime(info['end'], DATETIME_FORMAT)
        where = "{0} {1}".format(info['building'].strip(),
                                 info['location'].strip())
        # end_time=self._tz.localize(end),
        event = Event(
            start_time=self._tz.localize(when),
            timezone=self._tz.zone,
            location_name=where,
            name=title,
            description=title,
        )
        event.add_source(events_url)
        yield event
def scrape_meeting_notice(self, chamber, item, url):
    """Yield one Event built from a Delaware committee-meeting JSON item.

    Fix: MeetingDateTime values look like "04/25/2012 03:00:00 PM"
    (four-digit year, with a seconds field); the old pattern
    "%m/%d/%y %I:%M %p" could never parse that and always raised
    ValueError.
    """
    # Since Event Name is not provided for all meetings, use the
    # committee name instead.
    event_name = str(item['CommitteeName'])
    # 04/25/2012 03:00:00 PM
    fmt = "%m/%d/%Y %I:%M:%S %p"
    start_time = dt.datetime.strptime(str(item['MeetingDateTime']), fmt)
    location_name = str(item['AddressAliasNickname'])
    event = Event(location_name=location_name,
                  start_date=self._tz.localize(start_time),
                  name=event_name,
                  description='Committee Meeting Status: {}'
                  .format(item['CommitteeMeetingStatusName'])
                  )
    event.add_source(url)
    event.add_committee(name=str(item['CommitteeName']),
                        id=item['CommitteeId'])
    page_url = ("http://legis.delaware.gov/json/MeetingNotice/"
                "GetCommitteeMeetingItems?committeeMeetingId={}".format(
                    item['CommitteeMeetingId'])
                )
    event.add_source(page_url)
    page_data = self.post(page_url).json()['Data']
    # Distinct loop variable: the original shadowed the `item` parameter.
    for agenda_row in page_data:
        event.add_agenda_item(description=str(agenda_row['ItemDescription']))
        event.add_person(name=str(agenda_row['PrimarySponsorShortName']),
                         id=str(agenda_row['PrimarySponsorPersonId']),
                         note='Sponsor')
    yield event
def scrape_upper(self):
    """Yield Events parsed from the OK Senate meeting-notices page.

    Fix: `filter()` returns a lazy iterator on Python 3, so the
    original `lines[0]` raised TypeError — the result is now
    materialized with list().
    """
    url = "http://www.oksenate.gov/Committees/meetingnotices.htm"
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)
    text = page.text_content()
    _, text = text.split('MEETING NOTICES')
    # Dates like "Monday, January 1, 2018" delimit the notices.
    re_date = r'[A-Z][a-z]+,\s+[A-Z][a-z]+ \d+, \d{4}'
    chunks = zip(re.finditer(re_date, text), re.split(re_date, text)[1:])
    for match, data in chunks:
        when = match.group()
        when = datetime.datetime.strptime(when, "%A, %B %d, %Y")
        # Materialize so lines[0] works on Python 3.
        lines = list(filter(None, [x.strip() for x in data.splitlines()]))
        # "\x96" is the en-dash that terminates the TIME field.
        time_ = re.search(r'^\s*TIME:\s+(.+?)\s+\x96', data, re.M).group(1)
        time_ = time_.replace('a.m.', 'AM').replace('p.m.', 'PM')
        time_ = time.strptime(time_, '%I:%M %p')
        when += datetime.timedelta(hours=time_.tm_hour,
                                   minutes=time_.tm_min)
        title = lines[0]
        where = re.search(r'^\s*PLACE:\s+(.+)', data, re.M).group(1)
        where = where.strip()
        event = Event(name=title,
                      start_date=self._tz.localize(when),
                      location_name=where)
        event.add_source(url)
        yield event
def scrape_meeting_notice(self, chamber, item, url):
    """Yield one Event built from a Delaware committee-meeting JSON item.

    Fix: MeetingDateTime values look like "04/25/2012 03:00:00 PM"
    (four-digit year, with a seconds field); the old pattern
    "%m/%d/%y %I:%M %p" could never parse that and always raised
    ValueError.
    """
    # Since Event Name is not provided for all meetings, use the
    # committee name instead.
    event_name = str(item['CommitteeName'])
    # 04/25/2012 03:00:00 PM
    fmt = "%m/%d/%Y %I:%M:%S %p"
    start_time = dt.datetime.strptime(str(item['MeetingDateTime']), fmt)
    location_name = str(item['AddressAliasNickname'])
    event = Event(location_name=location_name,
                  start_date=self._tz.localize(start_time),
                  name=event_name,
                  description='Committee Meeting Status: {}'.format(
                      item['CommitteeMeetingStatusName']))
    event.add_source(url)
    event.add_committee(name=str(item['CommitteeName']),
                        id=item['CommitteeId'])
    page_url = ("http://legis.delaware.gov/json/MeetingNotice/"
                "GetCommitteeMeetingItems?committeeMeetingId={}".format(
                    item['CommitteeMeetingId']))
    event.add_source(page_url)
    page_data = self.post(page_url).json()['Data']
    # Distinct loop variable: the original shadowed the `item` parameter.
    for agenda_row in page_data:
        event.add_agenda_item(description=str(agenda_row['ItemDescription']))
        event.add_person(name=str(agenda_row['PrimarySponsorShortName']),
                         id=str(agenda_row['PrimarySponsorPersonId']),
                         note='Sponsor')
    yield event
def scrape_meetings(self, meetings, group):
    """
    Scrape and save event data from a list of meetings.

    Arguments:
    meetings -- A list of lxml elements containing event information
    group -- The type of meeting. The legislature site applies
             different formatting to events based on which group
             they correspond to.  `group` should be one of the
             following strings: 'house', 'senate', or 'commission'.
    """
    for meeting in meetings:
        when = self.get_date(meeting)
        description = self.get_description(meeting)
        location = self.get_location(meeting)
        # Only emit events that have all three required fields.
        if when and description and location:
            event = Event(name=description,
                          start_date=when.replace(tzinfo=self.tz),
                          description=description,
                          location_name=location)
            agenda = self.get_agenda(meeting)
            if agenda:
                event.add_agenda_item(agenda)
            # NOTE(review): `url` is not defined anywhere in this function;
            # unless it is a module-level name this raises NameError —
            # verify and pass the page URL explicitly.
            event.add_source(url)
            yield event
def test_no_location():
    """An Event created without a location_name should still validate."""
    start = datetime.datetime.utcnow().isoformat().split('.')[0] + 'Z'
    e = Event(name="get-together", start_date=start)
    e.add_source(url='http://example.com/foobar')
    e.validate()
def scrape_upper(self):
    """Yield Events parsed from the OK Senate meeting-notices page.

    Fix: `filter()` returns a lazy iterator on Python 3, so the
    original `lines[0]` raised TypeError — the result is now
    materialized with list().
    """
    url = "http://www.oksenate.gov/Committees/meetingnotices.htm"
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)
    text = page.text_content()
    _, text = text.split('MEETING NOTICES')
    # Dates like "Monday, January 1, 2018" delimit the notices.
    re_date = r'[A-Z][a-z]+,\s+[A-Z][a-z]+ \d+, \d{4}'
    chunks = zip(re.finditer(re_date, text), re.split(re_date, text)[1:])
    for match, data in chunks:
        when = match.group()
        when = datetime.datetime.strptime(when, "%A, %B %d, %Y")
        # Materialize so lines[0] works on Python 3.
        lines = list(filter(None, [x.strip() for x in data.splitlines()]))
        # "\x96" is the en-dash that terminates the TIME field.
        time_ = re.search(r'^\s*TIME:\s+(.+?)\s+\x96', data, re.M).group(1)
        time_ = time_.replace('a.m.', 'AM').replace('p.m.', 'PM')
        time_ = time.strptime(time_, '%I:%M %p')
        when += datetime.timedelta(hours=time_.tm_hour,
                                   minutes=time_.tm_min)
        title = lines[0]
        where = re.search(r'^\s*PLACE:\s+(.+)', data, re.M).group(1)
        where = where.strip()
        event = Event(name=title,
                      start_date=self._tz.localize(when),
                      location_name=where)
        event.add_source(url)
        yield event
def scrape_committee_events(self, code, name):
    """Yield Events from the CGA fullcalendar feed for committee `code`."""
    events_url = \
        'http://www.cga.ct.gov/basin/fullcalendar/commevents.php?' \
        'comm_code={}'.format(code)
    response_text = self.get(events_url).text
    DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
    for info in json.loads(response_text):
        if info['title'] is None:
            self.warning("Event found with no title; it will be skipped")
            continue
        if info['title'].startswith('CANCELLED:'):
            self.info("Cancelled event found; it will be skipped: {}".
                      format(info['title']))
            continue
        start_dt = datetime.datetime.strptime(info['start'], DATETIME_FORMAT)
        # end = datetime.datetime.strptime(info['end'], DATETIME_FORMAT)
        venue = "{0} {1}".format(info['building'].strip(),
                                 info['location'].strip())
        # end_time=self._tz.localize(end),
        event = Event(start_time=self._tz.localize(start_dt),
                      timezone=self._tz.zone,
                      location_name=venue,
                      name=info['title'],
                      description=info['title'])
        event.add_source(events_url)
        yield event
def parse_div(self, row, chamber, com): cal_link = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0] # event_date = row.xpath('string(.//div[contains(@class,"ItemDate")])').strip() title, location, start_date, end_date = self.parse_gcal(cal_link) event = Event( start_date=start_date, end_date=end_date, name=title, location_name=location, ) event.add_source('http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx') for item in row.xpath('.//div[@class="col-xs-12a Item"]'): description = item.xpath('string(.)').strip() agenda = event.add_agenda_item(description=description) for item in row.xpath('.//div[contains(@class,"ItemContainer")]/a'): description = item.xpath('string(.)').strip() agenda = event.add_agenda_item(description=description) event.add_document( description, item.xpath('@href')[0], media_type="application/pdf", on_duplicate="ignore" ) for item in row.xpath('.//div[contains(@class,"ItemContainer")]' '[./div[@class="col-xs-1 Item"]]'): description = item.xpath('string(.)').strip() agenda = event.add_agenda_item(description=description) bill = item.xpath('.//div[@class="col-xs-1 Item"]/a/text()')[0].strip() agenda.add_bill(bill) video = row.xpath('.//a[./span[@class="OnDemand"]]') if video: event.add_media_link( 'Video of Hearing', video[0].xpath('@href')[0], 'text/html' ) if 'subcommittee' in title.lower(): subcom = title.split('-')[0].strip() event.add_participant( subcom, type='committee', note='host', ) else: event.add_participant( com, type='committee', note='host', ) yield event
def scrape(self):
    """Yield Events from the council calendar page.

    The page lists month headings (an h2 inside a link) followed by
    plain <li> entries; `when` carries the most recent heading forward
    over the entries that follow it.
    """
    page = self.lxmlize(PAGE)
    events = page.xpath("//div[@class='col-middle']//ul/li")
    when = None
    for event in events:
        h3 = event.xpath("./a/h2")
        h3 = h3[0] if h3 else None
        if h3 is not None:
            when = h3.text
        else:
            # Entry before any heading: nothing to anchor the date to.
            if when is None:
                self.warning("Ungrok!")
                continue
            b, _, i = event.xpath("./p/*")
            title = b.text_content()
            event = i.text_content()
            if "NO MEETING" in event:
                continue
            day, title = (x.strip() for x in title.split("-", 1))
            where = "Council Chambers"
            # One <li> can hold several semicolon-separated sub-events.
            for subevent in (x.strip() for x in event.split(";")):
                if " in " in subevent:
                    subevent, where = subevent.rsplit(" in ", 1)
                subevent = subevent.replace(u'\xa0', ' ')
                if "NO" in subevent and "MEETING" in subevent:
                    continue
                if "to follow" in subevent:
                    continue
                # NOTE(review): EVENT_RE.match can return None on an
                # unexpected line, which would raise AttributeError here
                # — confirm the page format is stable.
                info = EVENT_RE.match(subevent).groupdict()
                event, time = [info[x] for x in ['event', 'time']]
                ampm = {
                    "a.m.": "AM",
                    "p.m.": "PM",
                }
                for old, new in ampm.items():
                    time = time.replace(old, new)
                dtstring = ", ".join([day, time])
                # Some entries omit the minutes ("7PM"), so fall back
                # to the minute-less format.
                try:
                    etime = dt.datetime.strptime(dtstring,
                                                 "%m/%d/%Y, %I:%M %p")
                except ValueError:
                    etime = dt.datetime.strptime(dtstring,
                                                 "%m/%d/%Y, %I%p")
                e = Event(name=event, when=etime, location=where)
                e.add_source(PAGE)
                yield e
def event_obj():
    """Return a minimal valid Event fixture with a single source."""
    start = datetime.datetime.utcnow().isoformat().split('.')[0] + 'Z'
    e = Event(name="get-together",
              start_date=start,
              location_name="Joe's Place")
    e.add_source(url='http://example.com/foobar')
    return e
def event_obj():
    """Build and return a throwaway Event for use in tests."""
    timestamp = datetime.datetime.utcnow().isoformat()
    event = Event(
        name="get-together",
        start_date=timestamp.split('.')[0] + 'Z',
        location_name="Joe's Place",
    )
    event.add_source(url='http://example.com/foobar')
    return event
def scrape_event_page(self, url, event_type):
    """Scrape one malegislature.gov event detail page into an Event.

    `event_type` is either 'Hearing' or a special-event label; only
    hearings get the host-committee participant added.
    """
    page = self.lxmlize(url)
    page.make_links_absolute('https://malegislature.gov/')
    title = page.xpath('string(//div[contains(@class,"followable")]/h1)')
    title = title.replace('Hearing Details', '').strip()
    title = title.replace('Special Event Details', '')
    # The event-information <dl> holds, in order: day (dd[2]),
    # time (dd[3]), location (dd[4]), description (dd[5]).
    start_day = page.xpath(
        'string(//dl[contains(@class,"eventInformation")]/dd[2])').strip()
    start_time = page.xpath(
        'string(//dl[contains(@class,"eventInformation")]/dd[3])').strip()
    location = page.xpath(
        'string(//dl[contains(@class,"eventInformation")]/dd[4]//a)'
    ).strip()
    description = page.xpath(
        'string(//dl[contains(@class,"eventInformation")]/dd[5])').strip()
    start_date = self._TZ.localize(
        dateutil.parser.parse('{} {}'.format(start_day, start_time), ))
    event = Event(start_date=start_date,
                  name=title,
                  location_name=location,
                  description=description)
    event.add_source(url)
    agenda_rows = page.xpath(
        '//div[contains(@class,"col-sm-8") and .//h2[contains(@class,"agendaHeader")]]'
        '/div/div/div[contains(@class,"panel-default")]')
    for row in agenda_rows:
        # only select the text node, not the spans
        agenda_title = row.xpath(
            'string(.//h4/a/text()[normalize-space()])').strip()
        if agenda_title == '':
            agenda_title = row.xpath(
                'string(.//h4/text()[normalize-space()])').strip()
        agenda = event.add_agenda_item(description=agenda_title)
        bills = row.xpath('.//tbody/tr/td[1]/a/text()')
        for bill in bills:
            # Normalize "H.110"-style ids to "H 110".
            bill = bill.strip().replace('.', ' ')
            agenda.add_bill(bill)
    if event_type == 'Hearing':
        event.add_participant(
            title,
            type='committee',
            note='host',
        )
    yield event
def scrape_house_weekly_schedule(self):
    """Yield Events for the LA House weekly meeting schedule.

    Fixes: the time regex is now a raw string (the old '\\d' literal
    relied on a deprecated escape), and the start datetime is localized
    exactly once — pytz's localize() raises ValueError when handed an
    already-aware datetime, which the old double call produced.
    """
    url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
    page = self.lxmlize(url)
    meeting_rows = page.xpath('//table[@id = "table229"]/tr')
    # Keep rows that have a committee name, a PDF agenda link, and are
    # not marked "Not Meeting".
    valid_meetings = [row for row in meeting_rows if row.xpath(
        './td[1]')[0].text_content().replace(u'\xa0', '') and row.xpath(
        './td/a/img[contains(@src, "PDF-AGENDA.png")]') and 'Not Meeting'
        not in row.xpath('./td[2]')[0].text_content()]
    for meeting in valid_meetings:
        try:
            guid = meeting.xpath('./td/a[descendant::img[contains(@src,'
                                 '"PDF-AGENDA.png")]]/@href')[0]
            # self.logger.debug(guid)
            self.warning("logger.debug" + guid)
        except KeyError:
            # Sometimes we have a dead link. This is only on
            # dead entries.
            continue
        committee_name = meeting.xpath('./td[1]/text()')[0].strip()
        meeting_string = meeting.xpath('./td[2]')[0].text_content()
        if "@" in meeting_string:
            continue  # Contains no time data.
        date, time, location = ([s.strip() for s in meeting_string.split(
            ',') if s] + [None]*3)[:3]
        # check for time in date because of missing comma
        time_srch = re.search(r'\d{2}:\d{2} (AM|PM)', date)
        if time_srch:
            location = time
            time = time_srch.group()
            date = date.replace(time, '')
        # self.logger.debug(location)
        self.warning("logger.debug" + location)
        year = datetime.datetime.now().year
        datetime_string = ' '.join((date, str(year), time))
        when = datetime.datetime.strptime(datetime_string,
                                          '%b %d %Y %I:%M %p')
        # Localize exactly once.
        when = self._tz.localize(when)
        description = 'Committee Meeting: {}'.format(committee_name)
        # self.logger.debug(description)
        self.warning("logger.debug" + description)
        event = Event(name=description,
                      start_date=when,
                      location_name=location)
        event.add_source(url)
        event.add_participant(committee_name, type='committee', note='host')
        event.add_document(note='Agenda', url=guid, text='agenda',
                           media_type='application/pdf')
        yield event
def scrape_house_weekly_schedule(self):
    """Yield Events for the LA House weekly meeting schedule.

    Fix: the start datetime is localized exactly once — pytz's
    localize() raises ValueError when handed an already-aware datetime,
    which the old double call (localize on assignment, then again in
    the Event constructor) produced.
    """
    url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
    page = self.lxmlize(url)
    meeting_rows = page.xpath('//table[@id = "table229"]/tr')
    # Keep rows that have a committee name, a PDF agenda link, and are
    # not marked "Not Meeting".
    valid_meetings = [row for row in meeting_rows if row.xpath(
        './td[1]')[0].text_content().replace(u'\xa0', '') and row.xpath(
        './td/a/img[contains(@src, "PDF-AGENDA.png")]') and 'Not Meeting'
        not in row.xpath('./td[2]')[0].text_content()]
    for meeting in valid_meetings:
        try:
            guid = meeting.xpath('./td/a[descendant::img[contains(@src,'
                                 '"PDF-AGENDA.png")]]/@href')[0]
            # self.logger.debug(guid)
            self.warning("logger.debug" + guid)
        except KeyError:
            # Sometimes we have a dead link. This is only on
            # dead entries.
            continue
        committee_name = meeting.xpath('./td[1]/text()')[0].strip()
        meeting_string = meeting.xpath('./td[2]')[0].text_content()
        if "@" in meeting_string:
            continue  # Contains no time data.
        date, time, location = ([s.strip() for s in meeting_string.split(
            ',') if s] + [None]*3)[:3]
        # check for time in date because of missing comma
        time_srch = re.search(r'\d{2}:\d{2} (AM|PM)', date)
        if time_srch:
            location = time
            time = time_srch.group()
            date = date.replace(time, '')
        # self.logger.debug(location)
        self.warning("logger.debug" + location)
        year = datetime.datetime.now().year
        datetime_string = ' '.join((date, str(year), time))
        when = datetime.datetime.strptime(datetime_string,
                                          '%b %d %Y %I:%M %p')
        # Localize exactly once.
        when = self._tz.localize(when)
        description = 'Committee Meeting: {}'.format(committee_name)
        # self.logger.debug(description)
        self.warning("logger.debug" + description)
        event = Event(name=description,
                      start_date=when,
                      location_name=location)
        event.add_source(url)
        event.add_participant(committee_name, type='committee', note='host')
        event.add_document(note='Agenda', url=guid, text='agenda',
                           media_type='application/pdf')
        yield event
def scrape(self):
    """Yield Events from the Miami-Dade commission calendar.

    Fix: times like "06:00PM" were parsed with %H (24-hour clock),
    which ignores the AM/PM marker and silently produced 6 AM; %I
    (12-hour clock) honors it.
    """
    local_timezone = pytz.timezone("US/Eastern")
    base_calendar_url = "http://www.miamidade.gov/cob/county-commission-calendar.asp"
    # things get messy more than a few months out
    # so we're just pulling 3 months. If we want three
    # more, they are called "nxx", "nxy" and "nxz"
    months = ["cur", "nex", "nxw"]
    for m in months:
        doc = self.lxmlize(base_calendar_url + "?next={}".format(m))
        events = doc.xpath("//table[contains(@style,'dotted #ccc')]")
        for event in events:
            rows = event.xpath(".//tr")
            # Each row is a "<label>: <value>" pair; collect the fields.
            for row in rows:
                heading, data = row.xpath(".//td")
                h = heading.text_content().lower().replace(":", "").strip()
                if h == "event":
                    title = data.text_content()
                    link = data.xpath(".//a")[0].attrib["href"]
                elif h == "event date":
                    # %I (12-hour) so the trailing AM/PM is honored.
                    when = datetime.strptime(data.text, '%m/%d/%y %I:%M%p')
                    when = local_timezone.localize(when)
                elif h == "location":
                    where = data.text
                elif h == "description":
                    description = data.text
            if link in DUPLICATE_EVENT_URLS:
                continue
            # Hand-entered below with a corrected date/location.
            if title == "Mayor's FY 2016-17 Proposed Budget Public Meeting":
                continue
            if not description:
                description = ""
            status = "confirmed"
            if "cancelled" in title.lower():
                status = "cancelled"
            e = Event(name=title,
                      start_time=when,
                      timezone="US/Eastern",
                      location_name=where,
                      description=description,
                      status=status)
            e.add_source(link)
            yield e
    e = Event(name="Mayor's FY 2016-17 Proposed Budget Public Meeting",
              start_time=local_timezone.localize(
                  datetime.strptime('08/08/16 06:00PM', '%m/%d/%y %I:%M%p')),
              timezone="US/Eastern",
              location_name='111 NW 1st Street',
              description='Pursuant to Section 2-1800A of the County Code, a Public Meeting has been scheduled by the Honorable Carlos A. Gimenez, Mayor, Miami-Dade County, to discuss the FY 2016-17 budget, tax rates, and fee changes.',
              status='confirmed')
    e.add_source('http://miamidade.gov/wps/Events/EventDetail.jsp?eventID=447192')
    yield e
def event_obj():
    """Return a minimal Event fixture (naive-UTC start, NY timezone)."""
    event = Event(
        name="get-together",
        start_time=datetime.datetime.utcnow(),
        location_name="Joe's Place",
        timezone="America/New_York",
    )
    event.add_source(url='foobar')
    return event
def scrape_upper(self):
    """Yield Events for Missouri Senate hearings."""
    listing_url = 'https://www.senate.mo.gov/hearingsschedule/hrings.htm'
    html = self.get(listing_url).text
    # The HTML here isn't wrapped in a container per-event
    # which makes xpath a pain. So string split by <hr>
    # then parse each event's fragment for cleaner results
    for fragment in html.split('<hr />')[1:]:
        page = lxml.html.fromstring(fragment)
        when_date = self.row_content(page, 'Date:')
        when_time = self.row_content(page, 'Time:')
        location = self.row_content(page, 'Room:')
        location = '{}, {}'.format(
            location, '201 W Capitol Ave, Jefferson City, MO 65101')
        # com = self.row_content(page, 'Committee:')
        com = page.xpath(
            '//td[descendant::b[contains(text(),"Committee")]]/a/text()'
        )[0]
        # Strip the trailing chair name ("…, Senator X") from the title.
        com = com.split(', Senator')[0].strip()
        start_date = self._TZ.localize(
            dateutil.parser.parse('{} {}'.format(when_date, when_time)))
        event = Event(start_date=start_date,
                      name=com,
                      location_name=location)
        event.add_source(listing_url)
        event.add_participant(
            com,
            type='committee',
            note='host',
        )
        # Each bill table: if it contains a bill link, row 2 holds the
        # agenda line; otherwise row 1 is the agenda line by itself.
        for bill_table in page.xpath(
                '//table[@width="85%" and @border="0"]'):
            bill_link = ''
            if bill_table.xpath(self.bill_link_xpath):
                agenda_line = bill_table.xpath('string(tr[2])').strip()
                agenda_item = event.add_agenda_item(
                    description=agenda_line)
                bill_link = bill_table.xpath(
                    self.bill_link_xpath)[0].strip()
                agenda_item.add_bill(bill_link)
            else:
                agenda_line = bill_table.xpath('string(tr[1])').strip()
                agenda_item = event.add_agenda_item(
                    description=agenda_line)
        yield event
def parse_div(self, row, chamber, com): cal_link = row.xpath('.//a[.//span[@id="calendarmarker"]]/@href')[0] # event_date = row.xpath('string(.//div[contains(@class,"ItemDate")])').strip() title, location, start_date, end_date = self.parse_gcal(cal_link) event = Event( start_date=start_date, end_date=end_date, name=title, location_name=location, ) event.add_source( 'http://mgaleg.maryland.gov/webmga/frmHearingSchedule.aspx') for item in row.xpath('.//div[@class="col-xs-12a Item"]'): description = item.xpath('string(.)').strip() agenda = event.add_agenda_item(description=description) for item in row.xpath('.//div[contains(@class,"ItemContainer")]/a'): description = item.xpath('string(.)').strip() agenda = event.add_agenda_item(description=description) event.add_document(description, item.xpath('@href')[0], media_type="application/pdf", on_duplicate="ignore") for item in row.xpath('.//div[contains(@class,"ItemContainer")]' '[./div[@class="col-xs-1 Item"]]'): description = item.xpath('string(.)').strip() agenda = event.add_agenda_item(description=description) bill = item.xpath( './/div[@class="col-xs-1 Item"]/a/text()')[0].strip() agenda.add_bill(bill) video = row.xpath('.//a[./span[@class="OnDemand"]]') if video: event.add_media_link('Video of Hearing', video[0].xpath('@href')[0], 'text/html') if 'subcommittee' in title.lower(): subcom = title.split('-')[0].strip() event.add_participant( subcom, type='committee', note='host', ) else: event.add_participant( com, type='committee', note='host', ) yield event
def scrape_event_page(self, url, event_type):
    """Scrape one malegislature.gov event detail page into an Event.

    `event_type` is either 'Hearing' or a special-event label; only
    hearings get the host-committee participant added.
    """
    page = self.lxmlize(url)
    page.make_links_absolute('https://malegislature.gov/')
    title = page.xpath('string(//div[contains(@class,"followable")]/h1)')
    title = title.replace('Hearing Details', '').strip()
    title = title.replace('Special Event Details', '')
    # The event-information <dl> holds, in order: day (dd[2]),
    # time (dd[3]), location (dd[4]), description (dd[5]).
    start_day = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[2])').strip()
    start_time = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[3])').strip()
    location = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[4]//a)').strip()
    description = page.xpath('string(//dl[contains(@class,"eventInformation")]/dd[5])').strip()
    start_date = self._TZ.localize(
        dateutil.parser.parse(
            '{} {}'.format(start_day, start_time),
        )
    )
    event = Event(
        start_date=start_date,
        name=title,
        location_name=location,
        description=description
    )
    event.add_source(url)
    agenda_rows = page.xpath(
        '//div[contains(@class,"col-sm-8") and .//h2[contains(@class,"agendaHeader")]]'
        '/div/div/div[contains(@class,"panel-default")]')
    for row in agenda_rows:
        # only select the text node, not the spans
        agenda_title = row.xpath('string(.//h4/a/text()[normalize-space()])').strip()
        if agenda_title == '':
            agenda_title = row.xpath('string(.//h4/text()[normalize-space()])').strip()
        agenda = event.add_agenda_item(description=agenda_title)
        bills = row.xpath('.//tbody/tr/td[1]/a/text()')
        for bill in bills:
            # Normalize "H.110"-style ids to "H 110".
            bill = bill.strip().replace('.', ' ')
            agenda.add_bill(bill)
    if event_type == 'Hearing':
        event.add_participant(
            title,
            type='committee',
            note='host',
        )
    yield event
def scrape(self, chamber=None):
    """Yield Events from the Utah Granicus agendas RSS feed."""
    URL = "http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas"
    doc = self.lxmlize(URL)
    events = doc.xpath("//item")
    for info in events:
        # Feed titles look like "<event name> - <Mon DD, YYYY>".
        title_and_date = info.xpath("title/text()")[0].split(" - ")
        title = title_and_date[0]
        when = title_and_date[-1]
        # if not when.endswith(session[ :len("20XX")]):
        #     continue
        event = Event(
            name=title,
            start_date=self._tz.localize(
                datetime.datetime.strptime(when, "%b %d, %Y")),
            location_name="State Capitol",
        )
        event.add_source(URL)
        # First URL in the item body is the agenda detail page.
        url = re.search(r"(http://.*?)\s", info.text_content()).group(1)
        try:
            doc = self.lxmlize(url)
        except HTTPError:
            self.logger.warning("Page missing, skipping")
            continue
        event.add_source(url)
        committee = doc.xpath('//a[text()="View committee page"]/@href')
        if committee:
            committee_doc = self.lxmlize(committee[0])
            committee_name = committee_doc.xpath(
                '//h3[@class="heading committee"]/text()')[0].strip()
            event.add_participant(committee_name, type="committee",
                                  note="host")
        # Document links are stashed in onclick handlers, not hrefs.
        documents = doc.xpath(".//td")
        for document in documents:
            url = re.search(r"(http://.*?pdf)",
                            document.xpath("@onclick")[0])
            if url is None:
                continue
            url = url.group(1)
            event.add_document(
                note=document.xpath("text()")[0],
                url=url,
                media_type="application/pdf",
            )
            # PDFs under bills/static are bill documents; link them to
            # an agenda item by their filename (the bill id).
            bills = document.xpath("@onclick")
            for bill in bills:
                if "bills/static" in bill:
                    bill_name = bill.split("/")[-1].split(".")[0]
                    item = event.add_agenda_item("Bill up for discussion")
                    item.add_bill(bill_name)
        yield event
def scrape(self):
    """Yield Events from the Hawaii hearings grid.

    Fix: the committee short-code lookup passed the literal string
    "committee" to short_ids.get() instead of the `committee` variable,
    so every non-INFO hearing silently fell through to the
    {"chamber": "unknown"} default.
    """
    tz = pytz.timezone("US/Eastern")
    get_short_codes(self)
    page = self.lxmlize(URL)
    table = page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]
    # Skip the header row.
    for event in table.xpath(".//tr")[1:]:
        tds = event.xpath("./td")
        committee = tds[0].text_content().strip()
        descr = [x.text_content() for x in tds[1].xpath(".//span")]
        if len(descr) != 1:
            raise Exception
        descr = descr[0].replace('.', '').strip()
        when = tds[2].text_content().strip()
        where = tds[3].text_content().strip()
        notice = tds[4].xpath(".//a")[0]
        notice_href = notice.attrib['href']
        notice_name = notice.text
        when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
        when = pytz.utc.localize(when)
        event = Event(name=descr, start_time=when,
                      classification='committee-meeting',
                      description=descr, location_name=where,
                      timezone=tz.zone)
        # Joint hearings list several committees separated by "/".
        if "/" in committee:
            committees = committee.split("/")
        else:
            committees = [committee]
        for committee in committees:
            if "INFO" not in committee:
                # Resolve the short code (was the literal key
                # "committee", which never matched anything).
                committee = self.short_ids.get(committee, {
                    "chamber": "unknown",
                    "name": committee
                })
            else:
                committee = {
                    "chamber": "joint",
                    "name": committee,
                }
            event.add_committee(committee['name'], note='host')
        event.add_source(URL)
        event.add_document(notice_name, notice_href,
                           media_type='text/html')
        for bill in self.get_related_bills(notice_href):
            a = event.add_agenda_item(description=bill['descr'])
            a.add_bill(bill['bill_id'], note=bill['type'])
        yield event
def scrape(self):
    """Yield Events from the DC Council calendar page."""
    calendar_url = "http://dccouncil.us/calendar"
    data = self.get(calendar_url).text
    doc = lxml.html.fromstring(data)
    # Matches "Committee <name> will ..." in the event description.
    committee_regex = re.compile("(Committee .*?)will")
    event_list = doc.xpath("//div[@class='event-description-dev']")
    for event in event_list:
        place_and_time = event.xpath(
            ".//div[@class='event-description-dev-metabox']/p/text()")
        when = " ".join([place_and_time[0].strip(),
                         place_and_time[1].strip()])
        if len(place_and_time) > 2:
            location = place_and_time[2]
        else:
            location = "unknown"
        # when is now of the following format:
        # Wednesday, 2/25/2015 9:30am
        when = datetime.datetime.strptime(when, "%A, %m/%d/%Y %I:%M%p")
        description_content = event.xpath(
            ".//div[@class='event-description-content-dev']")[0]
        description_lines = description_content.xpath("./*")
        name = description_lines[0].text_content()
        desc_without_title = " ".join(
            d.text_content() for d in description_lines[1:])
        description = re.sub(
            r'\s+', " ", description_content.text_content()).strip()
        potential_bills = description_content.xpath(".//li")
        # If the description names a host committee, classify the event
        # as a committee meeting.
        committee = committee_regex.search(desc_without_title)
        event_type = 'other'
        if committee is not None:
            committee = committee.group(1).strip()
            event_type = 'committee:meeting'
        e = Event(name=name,
                  description=description,
                  start_date=self._tz.localize(when),
                  location_name=location,
                  classification=event_type,
                  )
        for b in potential_bills:
            bill = b.xpath("./a/text()")
            if len(bill) == 0:
                continue
            bill = bill[0]
            bill_desc = b.text_content().replace(
                bill, "").strip(", ").strip()
            # Normalize "B21-1" style ids: strip spaces from the session
            # prefix and zero-pad the bill number to four digits.
            ses, num = bill.split("-")
            bill = ses.replace(" ", "") + "-" + num.zfill(4)
            item = e.add_agenda_item(bill_desc)
            item.add_bill(bill)
        e.add_source(calendar_url)
        if committee:
            e.add_participant(committee, type='organization', note='host')
        yield e
def scrape_upper(self):
    """Yield Events for Missouri Senate hearings."""
    listing_url = 'https://www.senate.mo.gov/hearingsschedule/hrings.htm'
    html = self.get(listing_url).text
    # The HTML here isn't wrapped in a container per-event
    # which makes xpath a pain. So string split by <hr>
    # then parse each event's fragment for cleaner results
    for fragment in html.split('<hr />')[1:]:
        page = lxml.html.fromstring(fragment)
        when_date = self.row_content(page, 'Date:')
        when_time = self.row_content(page, 'Time:')
        location = self.row_content(page, 'Room:')
        location = '{}, {}'.format(
            location,
            '201 W Capitol Ave, Jefferson City, MO 65101'
        )
        # com = self.row_content(page, 'Committee:')
        com = page.xpath('//td[descendant::b[contains(text(),"Committee")]]/a/text()')[0]
        # Strip the trailing chair name ("…, Senator X") from the title.
        com = com.split(', Senator')[0].strip()
        start_date = self._TZ.localize(
            dateutil.parser.parse('{} {}'.format(when_date, when_time))
        )
        event = Event(
            start_date=start_date,
            name=com,
            location_name=location
        )
        event.add_source(listing_url)
        event.add_participant(
            com,
            type='committee',
            note='host',
        )
        # Each bill table: if it contains a bill link, row 2 holds the
        # agenda line; otherwise row 1 is the agenda line by itself.
        for bill_table in page.xpath('//table[@width="85%" and @border="0"]'):
            bill_link = ''
            if bill_table.xpath(self.bill_link_xpath):
                agenda_line = bill_table.xpath('string(tr[2])').strip()
                agenda_item = event.add_agenda_item(description=agenda_line)
                bill_link = bill_table.xpath(self.bill_link_xpath)[0].strip()
                agenda_item.add_bill(bill_link)
            else:
                agenda_line = bill_table.xpath('string(tr[1])').strip()
                agenda_item = event.add_agenda_item(description=agenda_line)
        yield event
def scrape_chamber(self, chamber):
    """Yield Events for one PA chamber's committee-meeting schedule.

    Fix: urllib.parse.parse_qs returns a list per key, so the old
    '{}{} {}'.format(qs['body'], qs['type'], qs['bn']) produced ids
    like "['S']['B'] ['100']" — take the first element of each list.
    """
    url = utils.urls['events'][chamber]
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    for table in page.xpath(
            '//table[@class="CMS-MeetingDetail-CurrMeeting"]'):
        # The date lives on an anchor of the enclosing day container.
        date_string = table.xpath(
            'ancestor::div[@class="CMS-MeetingDetail"]/div/a/@name')[0]
        for row in table.xpath('tr'):
            time_string = row.xpath(
                'td[@class="CMS-MeetingDetail-Time"]/text()')[0].strip()
            description = row.xpath(
                'td[@class="CMS-MeetingDetail-Agenda"]/div/div'
            )[-1].text_content().strip()
            location = row.xpath('td[@class="CMS-MeetingDetail-Location"]'
                                 )[0].text_content().strip()
            committees = row.xpath(
                './/div[@class="CMS-MeetingDetail-Agenda-CommitteeName"]/a'
            )
            bills = row.xpath('.//a[contains(@href, "billinfo")]')
            # Rows with unparseable times (e.g. "Call of Chair") end the
            # table.
            try:
                start_time = datetime.datetime.strptime(
                    '{} {}'.format(date_string, time_string),
                    '%m/%d/%Y %I:%M %p',
                )
            except ValueError:
                break
            event = Event(
                name=description,
                start_time=self._tz.localize(start_time),
                location_name=location,
                timezone=self._tz.zone,
            )
            event.add_source(url)
            if bills or committees:
                item = event.add_agenda_item(description)
                for bill in bills:
                    parsed = urllib.parse.urlparse(bill.get('href'))
                    qs = urllib.parse.parse_qs(parsed.query)
                    # parse_qs values are lists; take the first element.
                    item.add_bill('{}{} {}'.format(
                        qs['body'][0], qs['type'][0], qs['bn'][0]))
                for committee in committees:
                    parsed = urllib.parse.urlparse(committee.get('href'))
                    qs = urllib.parse.parse_qs(parsed.query)
                    # NOTE(review): qs.get('Code') is a one-element list
                    # (or None) — confirm add_committee accepts that.
                    item.add_committee(
                        re.sub(r' \([S|H]\)$', '', committee.text),
                        id=qs.get('Code'),
                    )
            yield event
def scrape(self):
    """Scrape the AK meeting search results, yielding one Event per meeting."""
    EVENTS_URL = 'http://www.akleg.gov/basis/Meeting/Find'
    events = self.lxmlize(EVENTS_URL).xpath('//ul[@id="meetingResults"]/li')
    for info in events:
        event_url = info.xpath('span[@class="col04"]/a/@href')[0]
        doc = self.lxmlize(event_url)

        # Skip events that are placeholders or tentative
        # Also skip whole-chamber events
        if any(x.strip().startswith("No Meeting") for x in
                doc.xpath('//div[@class="schedule"]//text()')) \
                or "session" in \
                info.xpath('span[@class="col01"]/text()')[0].lower():
            continue

        # The event name is the schedule block's text, whitespace-normalized.
        name = " ".join(
            x.strip()
            for x in doc.xpath('//div[@class="schedule"]//text()')
            if x.strip()
        )

        # Skip events with no name
        if not name:
            continue

        event = Event(
            start_date=self._TZ.localize(
                datetime.datetime.strptime(
                    info.xpath('span[@class="col02"]/text()')[0],
                    self._DATETIME_FORMAT,
                )
            ),
            name=name,
            location_name=doc.xpath(
                '//div[@class="heading-container"]/span/text()'
            )[0].title()
        )

        event.add_participant(
            info.xpath('span[@class="col01"]/text()')[0].title(),
            type='committee',
            note='host',
        )
        for document in doc.xpath('//td[@data-label="Document"]/a'):
            event.add_document(
                document.xpath('text()')[0],
                url=document.xpath('@href')[0]
            )

        event.add_source(EVENTS_URL)
        # Escape spaces so the stored source URL is valid.
        event.add_source(event_url.replace(" ", "%20"))

        yield event
def scrape_event_page(self, event):
    """Scrape a single event detail page and yield an Event.

    Skips pages with no title or a CANCELED title, and recurring-series
    entries ("Occurs every ...") that carry no concrete end time.
    """
    url = event.attrib['href']
    page = self.lxmlize(url)
    title = page.xpath("//h2[@class='evlist_header']")
    title = title[0].text.strip() if title else None
    if title is None:
        return
    if "CANCELED" in title:
        return

    info = page.xpath(
        "//div[@style='position:relative;margin-right:40px;']")[0]
    blocks = info.xpath(".//div")
    # Collect "<label>: <value>" pairs from the info blocks.
    ret = {}
    for block in blocks:
        els = block.xpath("./*")
        if not els:
            continue
        if els[0].tag != 'label':
            continue
        label, div = els
        ret[label.text_content().strip()] = div.text_content().strip()

    date, start, end = (x.strip() for x in ret['When:'].split("\n"))
    start = re.sub("^@", "", start).strip()
    end = end.replace("-", "").strip()

    # Expand the one abbreviated month the site uses.
    replace = [
        ('Apr', 'April'),
    ]
    skip = ["Occurs every"]
    for k, v in replace:
        date = date.replace(k, v).strip()

    # Recurring-series rows have no parsable end time; skip them.
    # (any() replaces the original "True in (...)" anti-idiom; an unused
    # "when = utcnow()" assignment was also dropped.)
    if any(marker in end for marker in skip):
        return

    start = "%s %s" % (date, start)
    end = "%s %s" % (date, end)
    start, end = (dt.datetime.strptime(x, "%B %d, %Y %I:%M %p")
                  for x in (start, end))

    event = Event(name=title, location=ret['Where:'], when=start, end=end)
    event.add_source(url)
    yield event
def scrape(self, chamber=None):
    """Scrape the UT Granicus agenda RSS feed into Events."""
    URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
    doc = self.lxmlize(URL)
    events = doc.xpath('//item')

    for info in events:
        # RSS titles look like "<event name> - <Mon DD, YYYY>".
        title_and_date = info.xpath('title/text()')[0].split(" - ")
        title = title_and_date[0]
        when = title_and_date[-1]
        # if not when.endswith(session[ :len("20XX")]):
        #     continue

        event = Event(name=title,
                      start_date=self._tz.localize(datetime.datetime.strptime(when, '%b %d, %Y')),
                      location_name='State Capitol'
                      )
        event.add_source(URL)

        # The item body carries the detail-page URL as its first link.
        url = re.search(r'(http://.*?)\s', info.text_content()).group(1)
        try:
            doc = self.lxmlize(url)
        except HTTPError:
            self.logger.warning("Page missing, skipping")
            continue
        event.add_source(url)

        committee = doc.xpath('//a[text()="View committee page"]/@href')
        if committee:
            committee_doc = self.lxmlize(committee[0])
            committee_name = committee_doc.xpath(
                '//h3[@class="heading committee"]/text()')[0].strip()
            event.add_participant(committee_name, type='committee',
                                  note='host')

        # Documents are table cells whose onclick handler opens a PDF.
        documents = doc.xpath('.//td')
        for document in documents:
            url = re.search(r'(http://.*?pdf)', document.xpath('@onclick')[0])
            if url is None:
                continue
            url = url.group(1)
            event.add_document(
                note=document.xpath('text()')[0],
                url=url,
                media_type='application/pdf'
            )
            # PDFs hosted under bills/static are the bills themselves;
            # the filename stem is the bill id.
            bills = document.xpath('@onclick')
            for bill in bills:
                if "bills/static" in bill:
                    bill_name = bill.split("/")[-1].split(".")[0]
                    item = event.add_agenda_item('Bill up for discussion')
                    item.add_bill(bill_name)
        yield event
def scrape(self):
    """Scrape the committee agenda calendar, yielding one Event per row."""
    page = self.lxmlize(calurl)
    events = page.xpath("//table[@class='agenda-body']//tr")[1:]

    for event in events:
        comit_url = event.xpath(
            ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")

        if len(comit_url) != 1:
            # Every agenda row should link exactly one committee page;
            # the original raised a bare, message-less Exception here.
            raise Exception(
                "expected exactly one committee link in agenda row, "
                "got %d" % len(comit_url))

        comit_url = comit_url[0]
        who = self.scrape_participants(comit_url.attrib['href'])

        tds = event.xpath("./*")
        date = tds[0].text_content().strip()
        cttie = tds[1].text_content().strip()
        _chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
        info = tds[2]
        name = info.xpath("./a[contains(@href, 'raw')]")[0]
        notice = name.attrib['href']
        name = name.text
        time, where = info.xpath("./i/text()")
        what = tds[3].text_content()
        what = what.replace("Items: ", "")
        if "(None)" in what:
            continue
        what = [x.strip() for x in what.split(";")]

        # The page omits the year; assume the current year.
        when = ", ".join([date, str(dt.datetime.now().year), time])
        when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

        event = Event(
            name=name,
            location_name=where,
            start_date=self._tz.localize(when),
        )

        event.add_source(calurl)
        event.add_committee(cttie, note='host')
        event.add_document("notice", notice, media_type='application/pdf')

        for entry in what:
            item = event.add_agenda_item(entry)
            # Agenda entries that are bill ids get linked as bills.
            if entry.startswith(('AB', 'SB')):
                item.add_bill(entry)

        for thing in who:
            event.add_person(thing['name'])

        yield event
def scrape(self):
    """Scrape the committee agenda calendar, yielding one Event per row."""
    page = self.lxmlize(calurl)
    events = page.xpath("//table[@class='agenda-body']//tr")[1:]

    for event in events:
        comit_url = event.xpath(
            ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")

        if len(comit_url) != 1:
            # Every agenda row should link exactly one committee page;
            # the original raised a bare, message-less Exception here.
            raise Exception(
                "expected exactly one committee link in agenda row, "
                "got %d" % len(comit_url))

        comit_url = comit_url[0]
        who = self.scrape_participants(comit_url.attrib['href'])

        tds = event.xpath("./*")
        date = tds[0].text_content().strip()
        cttie = tds[1].text_content().strip()
        _chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
        info = tds[2]
        name = info.xpath("./a[contains(@href, 'raw')]")[0]
        notice = name.attrib['href']
        name = name.text
        time, where = info.xpath("./i/text()")
        what = tds[3].text_content()
        what = what.replace("Items: ", "")
        if "(None)" in what:
            continue
        what = [x.strip() for x in what.split(";")]

        # The page omits the year; assume the current year.
        when = ", ".join([date, str(dt.datetime.now().year), time])
        when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

        event = Event(
            name=name,
            location_name=where,
            start_date=self._tz.localize(when),
        )

        event.add_source(calurl)
        event.add_committee(cttie, note='host')
        event.add_document("notice", notice, media_type='application/pdf')

        for entry in what:
            item = event.add_agenda_item(entry)
            # Agenda entries that are bill ids get linked as bills.
            if entry.startswith(('AB', 'SB')):
                item.add_bill(entry)

        for thing in who:
            event.add_person(thing['name'])

        yield event
def scrape_event_page(self, event):
    """Scrape a single event detail page and yield an Event.

    Skips pages with no title or a CANCELED title, and recurring-series
    entries ("Occurs every ...") that carry no concrete end time.
    """
    url = event.attrib['href']
    page = self.lxmlize(url)
    title = page.xpath("//h2[@class='evlist_header']")
    title = title[0].text.strip() if title else None
    if title is None:
        return
    if "CANCELED" in title:
        return

    info = page.xpath("//div[@style='position:relative;margin-right:40px;']")[0]
    blocks = info.xpath(".//div")
    # Collect "<label>: <value>" pairs from the info blocks.
    ret = {}
    for block in blocks:
        els = block.xpath("./*")
        if not els:
            continue
        if els[0].tag != 'label':
            continue
        label, div = els
        ret[label.text_content().strip()] = div.text_content().strip()

    date, start, end = (x.strip() for x in ret['When:'].split("\n"))
    start = re.sub("^@", "", start).strip()
    end = end.replace("-", "").strip()

    # Expand the one abbreviated month the site uses.
    replace = [
        ('Apr', 'April'),
    ]
    skip = ["Occurs every"]
    for k, v in replace:
        date = date.replace(k, v).strip()

    # Recurring-series rows have no parsable end time; skip them.
    # (any() replaces the original "True in (...)" anti-idiom; an unused
    # "when = utcnow()" assignment was also dropped.)
    if any(marker in end for marker in skip):
        return

    start = "%s %s" % (date, start)
    end = "%s %s" % (date, end)
    start, end = (dt.datetime.strptime(x, "%B %d, %Y %I:%M %p")
                  for x in (start, end))

    event = Event(
        name=title,
        location=ret['Where:'],
        when=start,
        end=end)
    event.add_source(url)
    yield event
def scrape(self):
    """Scrape the HI hearing-notice table into committee-meeting Events."""
    tz = pytz.timezone("US/Eastern")
    get_short_codes(self)
    page = self.lxmlize(URL)
    table = page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

    for event in table.xpath(".//tr")[1:]:
        tds = event.xpath("./td")
        committee = tds[0].text_content().strip()
        descr = [x.text_content() for x in tds[1].xpath(".//span")]
        if len(descr) != 1:
            # The original raised a bare, message-less Exception here.
            raise Exception(
                "expected exactly one description span per row, got %d"
                % len(descr))
        descr = descr[0].replace('.', '').strip()
        when = tds[2].text_content().strip()
        where = tds[3].text_content().strip()
        notice = tds[4].xpath(".//a")[0]
        notice_href = notice.attrib['href']
        notice_name = notice.text

        when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
        when = pytz.utc.localize(when)
        event = Event(name=descr, start_time=when,
                      classification='committee-meeting',
                      description=descr, location_name=where,
                      timezone=tz.zone)

        # Joint hearings list several committees separated by "/".
        if "/" in committee:
            committees = committee.split("/")
        else:
            committees = [committee]

        for committee in committees:
            if "INFO" not in committee:
                # Resolve the short code to the full committee record.
                # (The original passed the literal string "committee" as
                # the key, so this lookup never succeeded.)
                committee = self.short_ids.get(
                    committee,
                    {"chamber": "unknown", "name": committee})
            else:
                committee = {
                    "chamber": "joint",
                    "name": committee,
                }
            event.add_committee(committee['name'], note='host')

        event.add_source(URL)
        event.add_document(notice_name, notice_href, media_type='text/html')

        for bill in self.get_related_bills(notice_href):
            a = event.add_agenda_item(description=bill['descr'])
            a.add_bill(
                bill['bill_id'],
                note=bill['type']
            )

        yield event
def parse_event(self, row, chamber):
    """Build an Event from one meeting XML row for the given chamber.

    sample event available at http://www.akleg.gov/apptester.html
    """
    committee_code = row.xpath("string(Sponsor)").strip()

    # Unknown committee codes fall back to a MISCELLANEOUS label.
    if committee_code in self.COMMITTEES[chamber]:
        committee_name = "{} {}".format(
            self.COMMITTEES_PRETTY[chamber],
            self.COMMITTEES[chamber][committee_code]["name"],
        )
    else:
        committee_name = "{} {}".format(
            self.COMMITTEES_PRETTY[chamber],
            'MISCELLANEOUS',
        )

    name = "{} {}".format(self.COMMITTEES_PRETTY[chamber],
                          row.xpath("string(Title)").strip())

    # If name is missing, make it "<CHAMBER> <COMMITTEE NAME>"
    if name == "":
        name = committee_name

    location = row.xpath("string(Location)").strip()

    # events with no location all seem to be committee hearings
    if location == "":
        location = "Alaska State Capitol, 120 4th St, Juneau, AK 99801"

    start_date = dateutil.parser.parse(row.xpath("string(Schedule)"))
    # todo: do i need to self._TZ.localize() ?
    event = Event(start_date=start_date, name=name, location_name=location)

    event.add_source("http://w3.akleg.gov/index.php#tab4")

    # Only attach a committee host when the code actually resolved.
    if committee_code in self.COMMITTEES[chamber]:
        event.add_participant(committee_name, type="committee", note="host")

    for item in row.xpath("Agenda/Item"):
        agenda_desc = item.xpath("string(Text)").strip()
        if agenda_desc != "":
            agenda_item = event.add_agenda_item(description=agenda_desc)
            if item.xpath("BillRoot"):
                bill_id = item.xpath("string(BillRoot)")
                # AK Bill ids have a bunch of extra spaces
                bill_id = re.sub(r"\s+", " ", bill_id)
                agenda_item.add_bill(bill_id)

    yield event
def scrape_chamber(self, chamber, session):
    """Scrape committee meetings for one chamber over the next 30 days."""
    # Chamber codes used by the remote agendas endpoint.
    cha = {"upper": "7", "lower": "3", "other": "4"}[chamber]

    print_format = "%m/%d/%Y"
    now = dt.datetime.now()

    start = now.strftime(print_format)
    end = (now + timedelta(days=30)).strftime(print_format)
    url = event_page % (cha, start, end)
    page = self.lxmlize(url)

    committees = page.xpath(
        "//a[contains(@href,'Agendas?CommitteeId')]/@href")
    for comm in committees:
        comm_page = self.lxmlize(comm)
        meetings = comm_page.xpath(
            "//li[contains(@class, 'partialagendaitems')]")
        for meeting in meetings:
            # Heading text looks like "<committee> - <m/d/Y h:m:s AM/PM>".
            heading, content = meeting.xpath("./ul/li")
            who, when = heading.text.split(" - ")
            meeting_title = "Scheduled meeting of %s" % who.strip()
            # NOTE(review): assumes the location occupies lines 6-9 of the
            # content block — brittle if the markup changes; verify.
            where_lines = content.text_content().split("\r\n")
            where = "\r\n".join([l.strip() for l in where_lines[6:9]])

            when = dt.datetime.strptime(when.strip(),
                                        "%m/%d/%Y %I:%M:%S %p")

            location = (where or '').strip() or "unknown"

            event = Event(name=meeting_title,
                          start_time=self._tz.localize(when),
                          timezone=self._tz.zone,
                          location_name=location,
                          description=meeting_title)
            event.add_participant(who.strip(), type='committee',
                                  note='host')
            event.add_source(url)

            # only scraping public hearing bills for now.
            bills = meeting.xpath(
                ".//div[text() = 'Public Hearing']/following-sibling::li"
                "[contains(@class, 'visible-lg')]")
            for bill in bills:
                bill_id, descr = bill.xpath("./a/text()")[0].split(" - ")
                item = event.add_agenda_item(descr.strip())
                item.add_bill(bill_id.strip())

            yield event
def scrape_lower_item(self, page):
    """Scrape one MO House hearing entry fragment and yield an Event."""
    # print(lxml.etree.tostring(page, pretty_print=True))
    com = self.table_row_content(page, 'Committee:')
    when_date = self.table_row_content(page, 'Date:')
    when_time = self.table_row_content(page, 'Time:')
    location = self.table_row_content(page, 'Location:')

    if 'house hearing room' in location.lower():
        location = '{}, {}'.format(
            location,
            '201 W Capitol Ave, Jefferson City, MO 65101'
        )

    # fix some broken times, e.g. '12 :00'
    when_time = when_time.replace(' :', ':')

    # Some times carry trailing text, e.g. "1:00 PM upon adjournment".
    # Truncate after the meridiem but KEEP the AM/PM marker so the hour
    # still parses correctly (the original dropped it, turning e.g.
    # "2:00 PM upon recess" into 2:00 AM).
    if 'upon' in when_time:
        for meridiem in ('AM', 'PM'):
            if meridiem in when_time:
                when_time = when_time.split(meridiem, 1)[0] + meridiem
                break

    start_date = self._TZ.localize(
        dateutil.parser.parse('{} {}'.format(when_date, when_time))
    )

    event = Event(
        start_date=start_date,
        name=com,
        location_name=location
    )

    event.add_source('https://house.mo.gov/HearingsTimeOrder.aspx')

    event.add_participant(
        com,
        type='committee',
        note='host',
    )

    # different from general MO link xpath due to the <b>
    house_link_xpath = './/a[contains(@href, "Bill.aspx") ' \
                       'or contains(@href, "bill.aspx")]/b/text()'

    for bill_title in page.xpath(house_link_xpath):
        bill_no = bill_title.split('--')[0].strip()
        bill_no = bill_no.replace('HCS', '').strip()

        agenda_item = event.add_agenda_item(description=bill_title)
        agenda_item.add_bill(bill_no)

    yield event
def scrape_page(self, url, session, chamber):
    """Scrape a single IL hearing-notice page; return the Event."""
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    ctty_name = doc.xpath("//span[@class='heading']")[0].text_content()

    tables = doc.xpath("//table[@cellpadding='3']")
    info = tables[0]
    rows = info.xpath(".//tr")
    # Collect the key/value metadata rows (Location:, Scheduled Date:, ...).
    metainf = {}
    for row in rows:
        tds = row.xpath(".//td")
        key = tds[0].text_content().strip()
        value = tds[1].text_content().strip()
        metainf[key] = value

    where = metainf['Location:']
    subject_matter = metainf['Subject Matter:']
    description = "{}, {}".format(ctty_name, subject_matter)

    datetime = metainf['Scheduled Date:']
    datetime = re.sub(r"\s+", " ", datetime)
    repl = {
        "AM": " AM",
        "PM": " PM"  # Space shim.
    }
    for r in repl:
        datetime = datetime.replace(r, repl[r])
    datetime = self.localize(
        dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p"))

    event = Event(description, start_date=datetime, location_name=where)
    event.add_source(url)

    if ctty_name.startswith('Hearing Notice For'):
        # str.replace returns a new string; the original discarded the
        # result, leaving the prefix in the participant name.
        ctty_name = ctty_name.replace('Hearing Notice For', '').strip()
    event.add_participant(ctty_name, 'organization')

    bills = tables[1]
    for bill in bills.xpath(".//tr")[1:]:
        tds = bill.xpath(".//td")
        if len(tds) < 4:
            continue
        # First, let's get the bill ID:
        bill_id = tds[0].text_content()
        agenda_item = event.add_agenda_item(bill_id)
        agenda_item.add_bill(bill_id)

    return event
def parse_event(self, row, chamber):
    """Build an Event from one meeting XML row for the given chamber.

    sample event available at http://www.akleg.gov/apptester.html
    """
    committee_code = row.xpath('string(Sponsor)').strip()

    # Guard the committee lookup: an unknown code previously raised
    # KeyError; fall back to MISCELLANEOUS like the sibling AK parser.
    if committee_code in self.COMMITTEES[chamber]:
        committee_name = '{} {}'.format(
            self.COMMITTEES_PRETTY[chamber],
            self.COMMITTEES[chamber][committee_code]['name'],
        )
    else:
        committee_name = '{} {}'.format(
            self.COMMITTEES_PRETTY[chamber],
            'MISCELLANEOUS',
        )

    name = '{} {}'.format(
        self.COMMITTEES_PRETTY[chamber],
        row.xpath('string(Title)').strip()
    )

    # If name is missing, make it "<CHAMBER> <COMMITTEE NAME>"
    if name == '':
        name = committee_name

    location = row.xpath('string(Location)').strip()

    # events with no location all seem to be committee hearings
    if location == '':
        location = 'Alaska State Capitol, 120 4th St, Juneau, AK 99801'

    start_date = dateutil.parser.parse(row.xpath('string(Schedule)'))
    # todo: do i need to self._TZ.localize() ?
    event = Event(
        start_date=start_date,
        name=name,
        location_name=location
    )

    event.add_source('http://w3.akleg.gov/index.php#tab4')

    # Only attach a committee host when the code actually resolved.
    if committee_code in self.COMMITTEES[chamber]:
        event.add_participant(
            committee_name,
            type='committee',
            note='host',
        )

    for item in row.xpath('Agenda/Item'):
        agenda_desc = item.xpath('string(Text)').strip()
        if agenda_desc != '':
            agenda_item = event.add_agenda_item(description=agenda_desc)
            if item.xpath('BillRoot'):
                bill_id = item.xpath('string(BillRoot)')
                # AK Bill ids have a bunch of extra spaces
                bill_id = re.sub(r'\s+', ' ', bill_id)
                agenda_item.add_bill(bill_id)

    yield event
def scrape_chamber(self, chamber):
    """Scrape committee meeting events for one PA chamber.

    Walks the meeting-detail tables, builds an Event per row, and
    attaches any bills and committees linked from the agenda cell.
    """
    url = utils.urls['events'][chamber]
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    for table in page.xpath('//table[@class="CMS-MeetingDetail-CurrMeeting"]'):
        # The date lives on an anchor in the enclosing day container.
        date_string = table.xpath(
            'ancestor::div[@class="CMS-MeetingDetail"]/div/a/@name')[0]
        for row in table.xpath('tr'):
            time_string = row.xpath(
                'td[@class="CMS-MeetingDetail-Time"]/text()')[0].strip()
            description = row.xpath(
                'td[@class="CMS-MeetingDetail-Agenda"]/div/div'
            )[-1].text_content().strip()
            location = row.xpath(
                'td[@class="CMS-MeetingDetail-Location"]'
            )[0].text_content().strip()
            committees = row.xpath(
                './/div[@class="CMS-MeetingDetail-Agenda-CommitteeName"]/a')
            bills = row.xpath('.//a[contains(@href, "billinfo")]')

            try:
                start_time = datetime.datetime.strptime(
                    '{} {}'.format(date_string, time_string),
                    '%m/%d/%Y %I:%M %p',
                )
            except ValueError:
                # NOTE(review): this abandons the rest of the table on one
                # bad time string — confirm that is intended.
                break

            event = Event(
                name=description,
                start_time=self._tz.localize(start_time),
                location_name=location,
                timezone=self._tz.zone,
            )
            event.add_source(url)
            if bills or committees:
                item = event.add_agenda_item(description)
                for bill in bills:
                    parsed = urllib.parse.urlparse(bill.get('href'))
                    qs = urllib.parse.parse_qs(parsed.query)
                    # parse_qs returns {key: [values]}; take the first value
                    # of each parameter (the original formatted the lists,
                    # producing ids like "['H']['B'] ['1']").
                    item.add_bill('{}{} {}'.format(
                        qs['body'][0], qs['type'][0], qs['bn'][0]))
                for committee in committees:
                    parsed = urllib.parse.urlparse(committee.get('href'))
                    qs = urllib.parse.parse_qs(parsed.query)
                    code = qs.get('Code')
                    item.add_committee(
                        # Strip a trailing " (S)" / " (H)" chamber marker.
                        re.sub(r' \((S|H)\)$', '', committee.text),
                        id=code[0] if code else None,
                    )
            yield event
def scrape_page(self, url, session, chamber):
    """Scrape a single IL hearing-notice page; return the Event."""
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    ctty_name = doc.xpath("//span[@class='heading']")[0].text_content()

    tables = doc.xpath("//table[@cellpadding='3']")
    info = tables[0]
    rows = info.xpath(".//tr")
    # Collect the key/value metadata rows (Location:, Scheduled Date:, ...).
    metainf = {}
    for row in rows:
        tds = row.xpath(".//td")
        key = tds[0].text_content().strip()
        value = tds[1].text_content().strip()
        metainf[key] = value

    where = metainf['Location:']
    subject_matter = metainf['Subject Matter:']
    description = "{}, {}".format(ctty_name, subject_matter)

    datetime = metainf['Scheduled Date:']
    datetime = re.sub(r"\s+", " ", datetime)
    repl = {
        "AM": " AM",
        "PM": " PM"  # Space shim.
    }
    for r in repl:
        datetime = datetime.replace(r, repl[r])
    datetime = self.localize(
        dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p"))

    event = Event(description, start_date=datetime, location_name=where)
    event.add_source(url)

    if ctty_name.startswith('Hearing Notice For'):
        # str.replace returns a new string; the original discarded the
        # result, leaving the prefix in the participant name.
        ctty_name = ctty_name.replace('Hearing Notice For', '').strip()
    event.add_participant(ctty_name, 'organization')

    bills = tables[1]
    for bill in bills.xpath(".//tr")[1:]:
        tds = bill.xpath(".//td")
        if len(tds) < 4:
            continue
        # First, let's get the bill ID:
        bill_id = tds[0].text_content()
        agenda_item = event.add_agenda_item(bill_id)
        agenda_item.add_bill(bill_id)

    return event
def scrape(self, session=None):
    """Scrape scheduled committee agendas from the NJ legislative DB."""
    if session is None:
        session = self.latest_session()
        self.info('no session specified, using %s', session)

    year_abr = ((int(session) - 209) * 2) + 2000
    self._init_mdb(year_abr)
    self.initialize_committees(year_abr)
    records = self.access_to_csv("Agendas")

    for record in records:
        if record['Status'] != "Scheduled":
            continue
        description = record['Comments']
        related_bills = []

        # Pull bill ids like "A-1234" / "S1234" out of the comments.
        for bill in re.findall(r"(A|S)(-)?(\d{4})", description):
            related_bills.append({
                "bill_id": "%s %s" % (bill[0], bill[2]),
                "descr": description
            })

        date_time = "%s %s" % (record['Date'], record['Time'])
        date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")

        try:
            hr_name = self._committees[record['CommHouse']]
        except KeyError:
            self.warning('unknown committee code %s, skipping',
                         record['CommHouse'])
            # The original fell through here with hr_name unset (NameError
            # or a stale value from the previous record); actually skip.
            continue

        description = 'Meeting of the {}'.format(hr_name)

        event = Event(
            name=description,
            start_date=self._tz.localize(date_time),
            location_name=record['Location'] or 'Statehouse',
        )

        item = None
        for bill in related_bills:
            item = item or event.add_agenda_item(description)
            item.add_bill(bill['bill_id'])

        event.add_committee(
            hr_name,
            id=record['CommHouse'],
            note='host',
        )
        event.add_source('http://www.njleg.state.nj.us/downloads.asp')

        yield event
def scrape(self, session=None):
    """Scrape scheduled committee agendas from the NJ legislative DB."""
    if session is None:
        session = self.latest_session()
        self.info('no session specified, using %s', session)

    year_abr = ((int(session) - 209) * 2) + 2000
    self._init_mdb(year_abr)
    self.initialize_committees(year_abr)
    records = self.access_to_csv("Agendas")

    for record in records:
        if record['Status'] != "Scheduled":
            continue
        description = record['Comments']
        related_bills = []

        # Pull bill ids like "A-1234" / "S1234" out of the comments.
        for bill in re.findall(r"(A|S)(-)?(\d{4})", description):
            related_bills.append({
                "bill_id": "%s %s" % (bill[0], bill[2]),
                "descr": description
            })

        date_time = "%s %s" % (record['Date'], record['Time'])
        date_time = dt.datetime.strptime(date_time, "%m/%d/%Y %I:%M %p")

        try:
            hr_name = self._committees[record['CommHouse']]
        except KeyError:
            self.warning('unknown committee code %s, skipping',
                         record['CommHouse'])
            # The original fell through here with hr_name unset (NameError
            # or a stale value from the previous record); actually skip.
            continue

        description = 'Meeting of the {}'.format(hr_name)

        event = Event(
            name=description,
            start_date=self._tz.localize(date_time),
            location_name=record['Location'] or 'Statehouse',
        )

        item = None
        for bill in related_bills:
            item = item or event.add_agenda_item(description)
            item.add_bill(bill['bill_id'])

        event.add_committee(
            hr_name,
            id=record['CommHouse'],
            note='host',
        )
        event.add_source('http://www.njleg.state.nj.us/downloads.asp')

        yield event
def categorize_data(self, csv_data):
    """Parse MD contribution CSV rows into (donor, recipient, event) tuples.

    Yields a 3-tuple per usable row; rows with an anonymous/unknown,
    unnamed contributor yield an empty list instead.
    """
    return_objs = []  # NOTE(review): never used — candidate for removal
    # Build a row type from the CSV header so fields are accessed by name.
    Contribution = namedtuple('Contribution', self.csv_header_row.replace(' ', '_'))
    for line in csv_data.split('\n'):
        # explicitly splitting on newlines because otherwise this fails
        # in the single-line case
        if not line:
            continue
        # cur_obj will be the person or organization that made the contribution
        cur_obj = None
        # NOTE(review): naive comma split — breaks on quoted fields that
        # contain commas; the csv module would be safer. TODO confirm input.
        contribution = Contribution(*line.split(','))
        if contribution.Contributor_Type in self.business_contribution_types:
            cur_obj = Organization(contribution.Contributor_Name)
        elif contribution.Contributor_Type in self.individual_contribution_types:
            cur_obj = Person(contribution.Contributor_Name)
        elif contribution.Contributor_Type == 'Unknown/Anonymous':
            if contribution.Contributor_Name:
                # ignoring un-named contributors
                # these look like catch-all business contributions
                cur_obj = Organization(contribution.Contributor_Name)
        if cur_obj:
            # we don't set cur_obj in the event that there was an
            # anonymous/unknown contribution without a Contribution_Name
            # so we need to check that it exists before adding to it
            cur_obj.add_source(url=self.search_url)
            cur_obj.source_identified = True
            if contribution.Contributor_Address:
                cur_obj.add_contact_detail(
                    type='address', value=contribution.Contributor_Address)
            if contribution.Employer_Name:
                cur_obj.extras['Employer'] = contribution.Employer_Name
            if contribution.Employer_Occupation:
                cur_obj.extras['Occupation'] = contribution.Employer_Occupation

            # recipiant_obj is the organization that received the contribution
            recipiant_obj = Organization(contribution.Receiving_Committee)
            recipiant_obj.extras['Office'] = contribution.Office
            recipiant_obj.extras['Filing Period'] = contribution.Filing_Period
            recipiant_obj.extras['Fundtype'] = contribution.Fundtype

            # transaction is the event linking the donor and recipiant
            transaction = Event('Contribution', contribution.Contribution_Date,
                                'EST', 'Maryland')  # EST and Maryland b/c MD
            transaction.extras['Contribution Amount'] = contribution.Contribution_Amount
            transaction.extras['Contribution Type'] = contribution.Contribution_Type
            transaction.add_source(url=self.search_url)
            # transaction.source_identified = True
            transaction.participants.append(cur_obj.as_dict())
            transaction.participants.append(recipiant_obj.as_dict())
            yield (cur_obj, recipiant_obj, transaction)
        else:
            yield []
def scrape(self):
    """Scrape the calendar page, yielding an Event per agenda entry.

    Heading elements establish the current date; subsequent <p> entries
    that link an Archive.aspx agenda become events on that date.
    """
    curdate = None
    page = self.lxmlize(CAL_PAGE)
    for el in page.xpath("//div[@id='Section1']/*"):
        if el.tag[0] == 'h':
            # Heading rows carry the date for the entries that follow.
            when = WHEN.findall(el.text_content())
            when = when[0] if when else None
            if when is None:
                continue
            curdate = " ".join(when)
        if el.tag == 'p':  # and el.attrib.get('class') == 'MsoNormal':
            agenda = el.xpath(".//a[contains(@href, 'Archive.aspx')]")
            agenda = agenda[0] if agenda else None
            if agenda is None:
                continue
            info = self.cleanup(el.text_content())
            when = DT.findall(info)
            when = when[0] if when else None
            if when is None:
                continue
            # (Unused locals els/people/places from the original removed.)
            time, ampm = when
            if curdate is None:
                self.warning(
                    "Can't scrape, since I don't know what date it is")
                continue
            tbuf = " ".join([curdate, time, ampm])
            obj = dt.datetime.strptime(tbuf, "%B %d %Y %I:%M %p")
            # Location follows an en-dash at the end of the entry text.
            try:
                _, where = info.rsplit(u"–", 1)
            except ValueError:
                continue
            where = where.replace(u" ", " ")
            where = re.sub(r"\s+", " ", where).strip()
            where = re.sub(r"agenda$", "", where).strip()
            event = Event(name=info, when=obj, location=where)
            event.add_source(CAL_PAGE)
            yield event
def scrape(self, session=None):
    """Scrape all VT committee meetings for the session's year slug."""
    if session is None:
        session = self.latest_session()
    year_slug = self.jurisdiction.get_year_slug(session)

    url = "http://legislature.vermont.gov/committee/loadAllMeetings/{}".format(
        year_slug
    )
    json_data = self.get(url).text
    events = json.loads(json_data)["data"]

    for info in events:
        # Determine when the committee meets
        if (
            info["TimeSlot"] == ""
            or info["TimeSlot"] == "1"
            or info["TimeSlot"] == 1
        ):
            # Empty or "1" TimeSlot means no specific time: all-day event.
            start_time = datetime.datetime.strptime(
                info["MeetingDate"], "%A, %B %d, %Y"
            )
            all_day = True
        else:
            try:
                start_time = datetime.datetime.strptime(
                    info["MeetingDate"] + ", " + info["TimeSlot"],
                    "%A, %B %d, %Y, %I:%M %p",
                )
            except ValueError:
                # Some records put the clock time in StartTime instead.
                start_time = datetime.datetime.strptime(
                    info["MeetingDate"] + ", " + info["StartTime"],
                    "%A, %B %d, %Y, %I:%M %p",
                )
            all_day = False
        event = Event(
            start_date=self.TIMEZONE.localize(start_time),
            all_day=all_day,
            name="Meeting of the {}".format(info["LongName"]),
            description="committee meeting",
            location_name="{0}, Room {1}".format(
                info["BuildingName"], info["RoomNbr"]
            ),
        )
        event.add_source(url)
        event.add_committee(name=info["LongName"], note="host")

        yield event
def scrape_chamber(self, chamber):
    """Scrape CA committee hearings from the DB, grouped by (location, date)."""
    grouped_hearings = defaultdict(list)

    for hearing in self.session.query(CACommitteeHearing):
        location = (self.session.query(CALocation).filter_by(
            location_code=hearing.location_code)[0].description)

        date = self._tz.localize(hearing.hearing_date)

        # Location descriptions start with "Asm"/"Sen"; use that prefix
        # to decide which chamber the hearing belongs to.
        chamber_abbr = location[0:3]
        event_chamber = {"Asm": "lower", "Sen": "upper"}[chamber_abbr]

        if event_chamber != chamber:
            continue

        grouped_hearings[(location, date)].append(hearing)

    for ((location, date), hearings) in grouped_hearings.items():

        # Get list of bill_ids from the database.
        bill_ids = [hearing.bill_id for hearing in hearings]
        bills = [
            "%s %s" % re.match(r"\d+([^\d]+)(\d+)", bill).groups()
            for bill in bill_ids
        ]

        # Dereference the committee_nr number and get display name.
        msg = "More than one committee meeting at (location, date) %r"
        msg = msg % ((location, date), )
        assert len(set(hearing.committee_nr for hearing in hearings)) == 1, msg
        committee_name = _committee_nr[hearings.pop().committee_nr]

        desc = "Committee Meeting: " + committee_name
        event = Event(name=desc, start_date=date,
                      location_name=committee_name)
        for bill_id in bills:
            # Ids containing "B" are bills; everything else is a resolution.
            if "B" in bill_id:
                type_ = "bill"
            else:
                type_ = "resolution"
            item = event.add_agenda_item("consideration")
            item.add_bill(bill_id, note=type_)

        event.add_person(committee_name + " Committee", note="host")
        event.add_source("https://downloads.leginfo.legislature.ca.gov/")

        yield event
def scrape(self, session=None, chamber=None):
    """Scrape scheduled meetings from the AR pipe-delimited FTP feed."""
    if not session:
        session = self.latest_session()
        self.info('no session specified, using %s', session)

    url = "ftp://www.arkleg.state.ar.us/dfadooas/ScheduledMeetings.txt"
    page = self.get(url)
    page = csv.reader(StringIO(page.text), delimiter='|')

    for row in page:
        # Deal with embedded newline characters, which cause fake new rows
        LINE_LENGTH = 11
        while len(row) < LINE_LENGTH:
            row += next(page)

        desc = row[7].strip()

        match = re.match(r'^(.*)- (HOUSE|SENATE)$', desc)
        if not match:
            # The original only assigned `comm` when the pattern matched
            # but then used it unconditionally (NameError or a stale value
            # from the previous row); skip non-matching rows instead.
            continue

        comm = match.group(1).strip()
        comm = re.sub(r'\s+', ' ', comm)
        location = row[5].strip() or 'Unknown'
        when = datetime.datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S')
        when = self._tz.localize(when)

        # Only assign events to a session if they are in the same year
        # Given that session metadata have some overlap and
        # missing end dates, this is the best option available
        session_year = int(session[:4])
        if session_year != when.year:
            continue

        description = "%s MEETING" % comm
        event = Event(
            name=description,
            start_time=when,
            location_name=location,
            description=description,
            timezone=self._tz.zone
        )
        event.add_source(url)
        event.add_participant(comm, type='committee', note='host')
        # time = row[3].strip()
        # if time in TIMECODES:
        #     event['notes'] = TIMECODES[time]

        yield event
def scrape_chamber(self, chamber):
    """Scrape CA committee hearings from the DB, grouped by (location, date)."""
    grouped_hearings = defaultdict(list)

    for hearing in self.session.query(CACommitteeHearing):
        location = self.session.query(CALocation).filter_by(
            location_code=hearing.location_code)[0].description

        date = self._tz.localize(hearing.hearing_date)

        # Location descriptions start with "Asm"/"Sen"; use that prefix
        # to decide which chamber the hearing belongs to.
        chamber_abbr = location[0:3]
        event_chamber = {'Asm': 'lower', 'Sen': 'upper'}[chamber_abbr]

        if event_chamber != chamber:
            continue

        grouped_hearings[(location, date)].append(hearing)

    for ((location, date), hearings) in grouped_hearings.items():

        # Get list of bill_ids from the database.
        bill_ids = [hearing.bill_id for hearing in hearings]
        bills = ["%s %s" % re.match(r'\d+([^\d]+)(\d+)', bill).groups()
                 for bill in bill_ids]

        # Dereference the committee_nr number and get display name.
        msg = 'More than one committee meeting at (location, date) %r'
        msg = msg % ((location, date),)
        assert len(set(hearing.committee_nr for hearing in hearings)) == 1, msg
        committee_name = _committee_nr[hearings.pop().committee_nr]

        desc = 'Committee Meeting: ' + committee_name
        event = Event(
            name=desc,
            start_date=date,
            location_name=committee_name,
        )
        for bill_id in bills:
            # Ids containing "B" are bills; everything else is a resolution.
            if 'B' in bill_id:
                type_ = 'bill'
            else:
                type_ = 'resolution'
            item = event.add_agenda_item('consideration')
            item.add_bill(bill_id, note=type_)

        event.add_person(committee_name + ' Committee', note='host')
        event.add_source('ftp://www.leginfo.ca.gov/pub/bill/')

        yield event
def scrape_events(self, chamber, event_id):
    """Scrape one upper-chamber event page by id; yield its Event(s).

    Raises once ids beyond the known range (> 1700) come back empty,
    signalling the caller that enumeration is done.
    """
    url = '%s%s' % (self.upper_url, event_id)
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    rows = doc.xpath("//div[@id='WebPartWPQ2']")
    # some ids are empty
    if len(rows):
        table_data = rows[0].find('table')[1]

        for link in table_data.iterchildren('td'):
            td = link.xpath('//td[@class="ms-formbody"]')

            description = td[18].text
            when = td[19].text
            where = td[25].text
            # type = td[27].text
            meeting_lead = td[28].text

            when = datetime.datetime.strptime(when, "%m/%d/%Y %H:%M %p")
            when = self._tz.localize(when)
            if where is None or where == "":
                where = 'State House'
            event = Event(name=description,
                          start_date=when,
                          location_name=where)
            if td[20].text is None:
                # Wrap the single meeting lead in a list so the loop below
                # adds the whole name (the original iterated the bare
                # string, adding one participant per character).
                participants = [meeting_lead] if meeting_lead else []
            else:
                participants = td[20].text.split(';')
            for participant in participants:
                name = participant.strip().replace('HON.', '', 1)
                if name != "":
                    event.add_participant(name, type='committee',
                                          note='host')
            event.add_source(url)
            yield event
    else:
        # hack so we dont fail on the first id numbers where there are some
        # gaps between the numbers that work and not.
        if event_id > 1700:
            raise Exception(
                "Parsing is done we are on future ids that are not used yet."
            )
def scrape(self):
    """Scrape the calendar page, yielding an Event per agenda entry.

    Heading elements establish the current date; subsequent <p> entries
    that link an Archive.aspx agenda become events on that date.
    """
    curdate = None
    page = self.lxmlize(CAL_PAGE)
    for el in page.xpath("//div[@id='Section1']/*"):
        if el.tag[0] == 'h':
            # Heading rows carry the date for the entries that follow.
            when = WHEN.findall(el.text_content())
            when = when[0] if when else None
            if when is None:
                continue
            curdate = " ".join(when)
        if el.tag == 'p':  # and el.attrib.get('class') == 'MsoNormal':
            agenda = el.xpath(".//a[contains(@href, 'Archive.aspx')]")
            agenda = agenda[0] if agenda else None
            if agenda is None:
                continue
            info = self.cleanup(el.text_content())
            when = DT.findall(info)
            when = when[0] if when else None
            if when is None:
                continue
            # (Unused locals els/people/places from the original removed.)
            time, ampm = when
            if curdate is None:
                self.warning(
                    "Can't scrape, since I don't know what date it is")
                continue
            tbuf = " ".join([curdate, time, ampm])
            obj = dt.datetime.strptime(tbuf, "%B %d %Y %I:%M %p")
            # Location follows an en-dash at the end of the entry text.
            try:
                _, where = info.rsplit(u"–", 1)
            except ValueError:
                continue
            where = where.replace(u" ", " ")
            where = re.sub(r"\s+", " ", where).strip()
            where = re.sub(r"agenda$", "", where).strip()
            event = Event(name=info, when=obj, location=where)
            event.add_source(CAL_PAGE)
            yield event
def scrape_chamber(self, chamber, session, start, end):
    """Yield events for one chamber from the committee-meeting XML feed.

    Arguments:
    chamber -- 'upper' or 'lower'; meetings whose agency maps to the
               other chamber are skipped
    session -- legislative session (unused here; kept for the interface)
    start, end -- date range passed through to ``self.get_xml``
    """
    page = self.get_xml(start, end)

    for meeting in xpath(page, '//wa:CommitteeMeeting'):
        # Guard clauses: drop cancelled meetings and the other chamber's.
        if xpath(meeting, 'string(wa:Cancelled)') == 'true':
            continue
        if self.chambers[xpath(meeting, 'string(wa:Agency)')] != chamber:
            continue

        start_date = self._tz.localize(
            datetime.datetime.strptime(
                xpath(meeting, 'string(wa:Date)'), "%Y-%m-%dT%H:%M:%S"))
        committee = xpath(meeting, 'string(wa:Committees/'
                          'wa:Committee/wa:LongName)')
        agenda_id = xpath(meeting, 'string(wa:AgendaId)')
        notes = xpath(meeting, 'string(wa:Notes)')

        # XML has a wa:Address but it seems useless
        location = '{}, {}, {} {}'.format(
            xpath(meeting, 'string(wa:Room)'),
            xpath(meeting, 'string(wa:Building)'),
            xpath(meeting, 'string(wa:City)'),
            xpath(meeting, 'string(wa:State)'),
        )

        event = Event(name=committee,
                      start_date=start_date,
                      location_name=location,
                      description=notes)
        event.add_source(
            'https://app.leg.wa.gov/committeeschedules/Home/Agenda/{}'.format(
                agenda_id))
        event.add_participant(committee, type='committee', note='host')
        event.extras['agendaId'] = agenda_id

        self.scrape_agenda_items(agenda_id, event)
        yield event
def scrape_event_page(self, session, chamber, url, datetime):
    """Scrape one hearing-notice page and yield an Event.

    Arguments:
    session -- legislative session (unused here; kept for the interface)
    chamber -- chamber name (unused here; kept for the interface)
    url -- URL of the hearing notice to parse
    datetime -- naive datetime of the hearing, localized here.
                NOTE(review): this parameter shadows the datetime module;
                renaming would break keyword callers, so it stays.

    Parses the page's ``KEY: value`` paragraphs into ``metainfo``, pulls
    the committee, location, and optional chair, and attaches an agenda
    item per bill id found in the page text.
    """
    page = self.lxmlize(url)
    info = page.xpath("//p")
    metainfo = {}
    plaintext = ""
    for p in info:
        content = re.sub(r"\s+", " ", p.text_content())
        plaintext += content + "\n"
        if ":" in content:
            key, val = content.split(":", 1)
            metainfo[key.strip()] = val.strip()
    committee = metainfo['COMMITTEE']
    where = metainfo['PLACE']
    # The chair is sometimes appended to the PLACE field.
    if "CHAIR" in where:
        where, chair = where.split("CHAIR:")
        metainfo['PLACE'] = where.strip()
        metainfo['CHAIR'] = chair.strip()

    chair = None
    if "CHAIR" in metainfo:
        chair = metainfo['CHAIR']

    plaintext = re.sub(r"\s+", " ", plaintext).strip()
    # Bill ids look like "HB 123", "SJR 4", etc.
    regexp = r"(S|J|H)(B|M|R) (\d+)"
    bills = re.findall(regexp, plaintext)

    event = Event(
        name=committee,
        start_date=self._tz.localize(datetime),
        location_name=where
    )

    event.add_source(url)
    event.add_participant(committee, type='committee', note='host')
    if chair is not None:
        event.add_participant(chair, type='legislator', note='chair')

    for bill in bills:
        # Renamed from (chamber, type, number): the old names shadowed
        # the `chamber` parameter and the `type` builtin.
        bill_chamber, bill_type, number = bill
        bill_id = "%s%s %s" % (bill_chamber, bill_type, number)
        item = event.add_agenda_item('Bill up for discussion')
        item.add_bill(bill_id)

    event.add_agenda_item(plaintext)
    yield event
def scrape_events(self, chamber, event_id):
    """Scrape a single upper-chamber event page and yield an Event.

    Arguments:
    chamber -- chamber name (unused here; kept for the caller's interface)
    event_id -- numeric id appended to ``self.upper_url`` to form the page URL

    Raises Exception once ``event_id`` passes 1700 and the page is empty,
    which the caller uses as the stop signal for sequential id probing.
    """
    url = '%s%s' % (self.upper_url, event_id)
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    rows = doc.xpath("//div[@id='WebPartWPQ2']")
    # some ids are empty
    if len(rows):
        table_data = rows[0].find('table')[1]

        for link in table_data.iterchildren('td'):
            td = link.xpath('//td[@class="ms-formbody"]')

            description = td[18].text
            when = td[19].text
            where = td[25].text
            # type = td[27].text
            meeting_lead = td[28].text

            # BUG FIX: %I (12-hour clock) must be used with %p; the old
            # %H ignored the AM/PM marker, shifting afternoon meetings
            # twelve hours early.
            when = datetime.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
            when = self._tz.localize(when)
            if where is None or where == "":
                where = 'State House'
            event = Event(name=description,
                          start_date=when,
                          location_name=where)
            if td[20].text is None:
                # BUG FIX: meeting_lead is a plain string; assigning it
                # directly made the loop below iterate per *character*.
                # Wrap it in a list (empty when the cell is blank).
                participants = [meeting_lead] if meeting_lead else []
            else:
                participants = td[20].text.split(';')
            if participants:
                for participant in participants:
                    name = participant.strip().replace('HON.', '', 1)
                    if name != "":
                        event.add_participant(name, type='committee',
                                              note='host')
            event.add_source(url)
            yield event
    else:
        # hack so we dont fail on the first id numbers where there are some gaps
        # between the numbers that work and not.
        if event_id > 1700:
            raise Exception("Parsing is done we are on future ids that are not used yet.")