def scrape_house_weekly_schedule(self, session):
    """Scrape the Louisiana House weekly schedule page.

    Creates one 'committee:meeting' Event per agenda-PDF link found and
    saves it via ``self.save_event``.
    """
    url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)
        # Each agenda PDF is an <img alt="See Agenda in pdf"> inside an <a>.
        for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
            try:
                guid = link.attrib['href']
            except KeyError:
                continue  # Sometimes we have a dead link. This is only on
                          # dead entries.
            committee = link.xpath("string(../../../td[1])").strip()
            when_and_where = link.xpath("string(../../../td[2])").strip()
            # Location is assumed to be the last comma-separated chunk of
            # the combined when/where cell — TODO confirm cell format.
            location = when_and_where.split(',')[-1]
            when = parse_datetime(when_and_where, session)
            description = 'Committee Meeting: %s' % committee
            event = Event(session, when, 'committee:meeting',
                          description, location=location)
            event.add_source(url)
            event.add_participant('host', committee, chamber='lower')
            event.add_document("Agenda", guid, type='agenda',
                               mimetype="application/pdf")
            event['link'] = guid
            self.save_event(event)
def scrape(self, session, chambers):
    """Scrape Utah committee-meeting agendas from the Granicus RSS feed.

    One 'committee:meeting' Event is created per RSS item whose date
    matches the session's year, then enriched from the linked agenda page.
    """
    URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
    doc = self.lxmlize(URL)
    events = doc.xpath('//item')
    for info in events:
        # Titles look like "<committee> - <Mon DD, YYYY>".
        title_and_date = info.xpath('title/text()')[0].split(" - ")
        title = title_and_date[0]
        when = title_and_date[-1]
        # Only keep items whose year matches the session's leading year.
        if not when.endswith(session[:len("20XX")]):
            continue
        event = Event(session=session,
                      when=datetime.datetime.strptime(when, '%b %d, %Y'),
                      type='committee:meeting',
                      description=title,
                      location='State Capitol')
        event.add_source(URL)
        url = re.search(r'(http://.*?)\s', info.text_content()).group(1)
        doc = self.lxmlize(url)
        event.add_source(url)
        committee = doc.xpath('//a[text()="View committee page"]/@href')
        if committee:
            committee_doc = self.lxmlize(committee[0])
            committee_name = committee_doc.xpath(
                '//h3[@class="heading committee"]/text()')[0].strip()
            # BUG FIX: the name is lower-cased before the comparison, so
            # the old checks against "Senate"/"House" could never match and
            # every committee was tagged "joint".  Compare lower to lower.
            if committee_name.lower().startswith("senate"):
                chamber = "upper"
            elif committee_name.lower().startswith("house"):
                chamber = "lower"
            else:
                chamber = "joint"
            event.add_participant(type='host',
                                  participant=committee_name,
                                  participant_type='committee',
                                  chamber=chamber)
        documents = doc.xpath('.//td')
        for document in documents:
            # Document URLs are embedded in the cell's onclick handler.
            url = re.search(r'(http://.*?pdf)',
                            document.xpath('@onclick')[0])
            if url is None:
                continue
            url = url.group(1)
            event.add_document(name=document.xpath('text()')[0], url=url,
                               mimetype='application/pdf')
            bills = document.xpath('@onclick')
            for bill in bills:
                # Static bill PDFs indicate a bill up for consideration.
                if "bills/static" in bill:
                    bill_name = bill.split("/")[-1].split(".")[0]
                    event.add_related_bill(
                        bill_name, type='consideration',
                        description='Bill up for discussion')
        self.save_event(event)
def scrape_house_weekly_schedule(self, session):
    """Scrape the LA House weekly schedule and save committee events."""
    url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
        try:
            guid = link.attrib['href']
        except KeyError:
            continue  # Sometimes we have a dead link. This is only on
                      # dead entries.
        committee = link.xpath("string(../../td[1])").strip()
        when_and_where = link.xpath("string(../../td[2])").strip()
        when_and_where = re.sub("\s+", " ", when_and_where).strip()
        if "@" in when_and_where:
            continue  # Contains no time data.
        if when_and_where.strip() == "":
            continue
        # Split into a date/time part and a room part; rooms appear to be
        # "H" or "C...-..." strings — TODO confirm against live pages.
        info = re.match(r"(?P<when>.*) (?P<where>H|C.*-.*?)",
                        when_and_where).groupdict()
        when_and_where = info['when']
        location = info['where']
        year = datetime.datetime.now().year
        when = parse_datetime(when_and_where, year)  # We can only scrape
        # the current year's events in LA.
        # when = self._tz.localize(when)
        bills = self.scrape_bills(when_and_where)
        description = 'Committee Meeting: %s' % committee
        event = Event(session, when, 'committee:meeting',
                      description, location=location)
        event.add_source(url)
        event.add_participant('host', committee, 'committee',
                              chamber='lower')
        event.add_document("Agenda", guid, type='agenda',
                           mimetype="application/pdf")
        for bill in bills:
            event.add_related_bill(bill, description=when_and_where,
                                   type='consideration')
        event['link'] = guid
        self.save_event(event)
def scrape_agenda(self, url, session): page = self.lxmlize(url) # Get the date/time info: date_time = page.xpath("//table[@class='time_place']")[0] lines = date_time.xpath("./tr") metainf = {} for line in lines: tds = line.xpath("./td") metainf[tds[0].text_content()] = tds[1].text_content() date = metainf['DATE:'] time = metainf['TIME:'] where = metainf['PLACE:'] fmt = "%A, %B %d, %Y" if time in all_day: datetime = date else: fmt += " %I:%M %p" datetime = "%s %s" % ( date, time ) datetime = dt.datetime.strptime(datetime, fmt) event = Event(session, datetime, 'committee:meeting', 'Meeting Notice', location=where) event.add_source(url) # aight. Let's get us some bills! bills = page.xpath("//b/a") for bill in bills: bill_ft = bill.attrib['href'] event.add_document(bill.text_content(), bill_ft, type="full-text", mimetype="application/pdf") root = bill.xpath('../../*') root = [ x.text_content() for x in root ] bill_id = "".join(root) if "SCHEDULED FOR" in bill_id: continue descr = bill.getparent().getparent().getparent().getnext().getnext( ).text_content() for thing in replace: bill_id = bill_id.replace(thing, replace[thing]) event.add_related_bill(bill_id, description=descr, type='consideration') committee = page.xpath("//span[@id='lblSession']")[0].text_content() chambers = { "house" : "lower", "joint" : "joint", "senate" : "upper" } chamber = "other" for key in chambers: if key in committee.lower(): chamber = chambers[key] event.add_participant("host", committee, chamber=chamber) self.save_event(event)
def scrape(self, session, chambers):
    """Scrape Hawaii's hearing-notice table into committee-meeting events."""
    get_short_codes(self)  # populates self.short_ids with committee codes
    page = self.lxmlize(URL)
    table = page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]
    for event in table.xpath(".//tr")[1:]:  # skip the header row
        tds = event.xpath("./td")
        committee = tds[0].text_content().strip()
        bills = [x.text_content() for x in tds[1].xpath(".//a")]
        descr = [x.text_content() for x in tds[1].xpath(".//span")]
        if len(descr) != 1:
            raise Exception
        descr = descr[0]
        when = tds[2].text_content().strip()
        where = tds[3].text_content().strip()
        notice = tds[4].xpath(".//a")[0]
        notice_href = notice.attrib['href']
        notice_name = notice.text
        when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
        event = Event(session, when, 'committee:meeting', descr,
                      location=where)
        # Joint hearings list multiple committees separated by "/".
        if "/" in committee:
            committees = committee.split("/")
        else:
            committees = [committee, ]
        for committee in committees:
            if "INFO" not in committee:
                # Resolve the short code to a {chamber, name} record.
                committee = self.short_ids[committee]
            else:
                # Informational briefings are treated as joint events.
                committee = {
                    "chamber": "joint",
                    "name": committee,
                }
            event.add_participant('host', committee['name'], 'committee',
                                  chamber=committee['chamber'])
        event.add_source(URL)
        event.add_document(notice_name, notice_href, mimetype='text/html')
        for bill in self.get_related_bills(notice_href):
            event.add_related_bill(bill['bill_id'],
                                   description=bill['descr'],
                                   type=bill['type'])
        self.save_event(event)
def scrape_house_weekly_schedule(self, session):
    """Scrape the LA House meeting-schedule table into committee events."""
    url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
    page = self.lxmlize(url)
    meeting_rows = page.xpath('//table[@id = "table229"]/tr')
    # Keep rows that have a committee name, an agenda PDF link, and are
    # not flagged "Not Meeting".
    valid_meetings = [
        row for row in meeting_rows
        if row.xpath('./td[1]')[0].text_content().replace(u'\xa0', '') and
        row.xpath('./td/a/img[contains(@src, "PDF-AGENDA.png")]') and
        'Not Meeting' not in row.xpath('./td[2]')[0].text_content()
    ]
    for meeting in valid_meetings:
        try:
            guid = meeting.xpath('./td/a[descendant::img[contains(@src, '
                                 '"PDF-AGENDA.png")]]/@href')[0]
            self.logger.debug(guid)
        except KeyError:
            continue  # Sometimes we have a dead link. This is only on
                      # dead entries.
        committee_name = meeting.xpath('./td[1]/text()')[0].strip()
        meeting_string = meeting.xpath('./td[2]')[0].text_content()
        if "@" in meeting_string:
            continue  # Contains no time data.
        # Cell format: "<date>, <time>, <location>"; pad with None for
        # missing trailing fields.
        date, time, location = (
            [s.strip() for s in meeting_string.split(',') if s] +
            [None] * 3)[:3]
        self.logger.debug(location)
        year = datetime.datetime.now().year  # the page omits the year
        datetime_string = ' '.join((date, str(year), time))
        when = datetime.datetime.strptime(datetime_string,
                                          '%b %d %Y %I:%M %p')
        when = self._tz.localize(when)
        description = 'Committee Meeting: {}'.format(committee_name)
        self.logger.debug(description)
        event = Event(session, when, 'committee:meeting',
                      description, location=location)
        event.add_source(url)
        event.add_participant('host', committee_name, 'committee',
                              chamber='lower')
        event.add_document('Agenda', guid, type='agenda',
                           mimetype='application/pdf')
        event['link'] = guid
        self.save_event(event)
def scrape(self, session, chambers):
    """Scrape Hawaii's hearing-notice table into committee-meeting events.

    Resolves committee short codes via ``self.short_ids`` (populated by
    ``get_short_codes``) and saves one Event per table row.
    """
    get_short_codes(self)  # populates self.short_ids with committee codes
    page = self.lxmlize(URL)
    table = page.xpath(
        "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]
    for event in table.xpath(".//tr")[1:]:  # skip the header row
        tds = event.xpath("./td")
        committee = tds[0].text_content().strip()
        bills = [x.text_content() for x in tds[1].xpath(".//a")]
        descr = [x.text_content() for x in tds[1].xpath(".//span")]
        if len(descr) != 1:
            raise Exception("Expected exactly one description span, "
                            "got %d" % len(descr))
        descr = descr[0]
        when = tds[2].text_content().strip()
        where = tds[3].text_content().strip()
        notice = tds[4].xpath(".//a")[0]
        notice_href = notice.attrib['href']
        notice_name = notice.text
        when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
        event = Event(session, when, 'committee:meeting', descr,
                      location=where)
        # Joint hearings list multiple committees separated by "/".
        if "/" in committee:
            committees = committee.split("/")
        else:
            committees = [committee, ]
        for committee in committees:
            if "INFO" not in committee:
                # BUG FIX: the old code looked up the literal key
                # "committee" instead of the variable, so every committee
                # hit the fallback.  Look up the actual short code.
                committee = self.short_ids.get(
                    committee, {"chamber": "unknown", "name": committee})
            else:
                # Informational briefings are treated as joint events.
                committee = {
                    "chamber": "joint",
                    "name": committee,
                }
            event.add_participant('host', committee['name'], 'committee',
                                  chamber=committee['chamber'])
        event.add_source(URL)
        event.add_document(notice_name, notice_href, mimetype='text/html')
        for bill in self.get_related_bills(notice_href):
            event.add_related_bill(
                bill['bill_id'],
                description=bill['descr'],
                type=bill['type']
            )
        self.save_event(event)
def scrape_house_weekly_schedule(self, session):
    """Scrape the LA House meeting-schedule table into committee events."""
    url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
    page = self.lxmlize(url)
    meeting_rows = page.xpath('//table[@id = "table229"]/tr')
    # Keep rows with a committee name, an agenda PDF link, and no
    # "Not Meeting" marker.
    valid_meetings = [row for row in meeting_rows if row.xpath(
        './td[1]')[0].text_content().replace(u'\xa0', '') and row.xpath(
        './td/a/img[contains(@src, "PDF-AGENDA.png")]') and
        'Not Meeting' not in row.xpath('./td[2]')[0].text_content()]
    for meeting in valid_meetings:
        try:
            guid = meeting.xpath('./td/a[descendant::img[contains(@src, '
                                 '"PDF-AGENDA.png")]]/@href')[0]
            self.logger.debug(guid)
        except KeyError:
            continue  # Sometimes we have a dead link. This is only on
                      # dead entries.
        committee_name = meeting.xpath('./td[1]/text()')[0].strip()
        meeting_string = meeting.xpath('./td[2]')[0].text_content()
        if "@" in meeting_string:
            continue  # Contains no time data.
        # Cell format: "<date>, <time>, <location>"; pad with None for
        # missing trailing fields.
        date, time, location = ([s.strip() for s in meeting_string.split(
            ',') if s] + [None] * 3)[:3]
        # check for time in date because of missing comma
        time_srch = re.search('\d{2}:\d{2} (AM|PM)', date)
        if time_srch:
            # Fields shifted left by one: what parsed as "time" is really
            # the location, and the time is embedded in "date".
            location = time
            time = time_srch.group()
            date = date.replace(time, '')
        self.logger.debug(location)
        year = datetime.datetime.now().year  # the page omits the year
        datetime_string = ' '.join((date, str(year), time))
        when = datetime.datetime.strptime(datetime_string,
                                          '%b %d %Y %I:%M %p')
        when = self._tz.localize(when)
        description = 'Committee Meeting: {}'.format(committee_name)
        self.logger.debug(description)
        event = Event(session, when, 'committee:meeting',
                      description, location=location)
        event.add_source(url)
        event.add_participant('host', committee_name, 'committee',
                              chamber='lower')
        event.add_document('Agenda', guid, type='agenda',
                           mimetype='application/pdf')
        event['link'] = guid
        self.save_event(event)
def test_event():
    """An Event records added documents (defaulting type to 'other')
    and related bills exactly as given."""
    event = Event('S1', datetime.datetime(2012, 1, 1), 'meeting',
                  'event description', 'event location')
    event.add_document('agenda', 'http://example.com/event/agenda.txt')
    event.add_related_bill('HB 1', relation='considered')

    expected_documents = [{'name': 'agenda',
                           'url': 'http://example.com/event/agenda.txt',
                           'type': 'other'}]
    expected_bills = [{'bill_id': 'HB 1', 'relation': 'considered'}]
    assert_equal(event['documents'], expected_documents)
    assert_equal(event['related_bills'], expected_bills)
def scrape_house_weekly_schedule(self, session):
    """Scrape the LA House weekly schedule and save committee events."""
    url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
        try:
            guid = link.attrib['href']
        except KeyError:
            continue  # Sometimes we have a dead link. This is only on
                      # dead entries.
        committee = link.xpath("string(../../td[1])").strip()
        when_and_where = link.xpath("string(../../td[2])").strip()
        when_and_where = re.sub("\s+", " ", when_and_where).strip()
        if "@" in when_and_where:
            continue  # Contains no time data.
        if when_and_where.strip() == "":
            continue
        # Split into a date/time part and a room part; rooms appear to be
        # "F", "N", "H", or "C...-..." strings — TODO confirm formats.
        info = re.match(
            r"(?P<when>.*) (?P<where>F|N|H|C.*-.*?)",
            when_and_where
        ).groupdict()
        when_and_where = info['when']
        location = info['where']
        year = datetime.datetime.now().year
        when = parse_datetime(when_and_where, year)  # We can only scrape
        # the current year's events in LA.
        # when = self._tz.localize(when)
        bills = self.scrape_bills(when_and_where)
        description = 'Committee Meeting: %s' % committee
        event = Event(session, when, 'committee:meeting',
                      description, location=location)
        event.add_source(url)
        event.add_participant('host', committee, 'committee',
                              chamber='lower')
        event.add_document("Agenda", guid, type='agenda',
                           mimetype="application/pdf")
        for bill in bills:
            event.add_related_bill(bill, description=when_and_where,
                                   type='consideration')
        event['link'] = guid
        self.save_event(event)
def scrape(self, session, chambers):
    """Scrape the committee-hearing calendar at module-level ``calurl``."""
    page = self.lxmlize(calurl)
    events = page.xpath("//table[@class='agenda-body']//tr")[1:]  # skip header
    for event in events:
        comit_url = event.xpath(
            ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")
        if len(comit_url) != 1:
            raise Exception
        comit_url = comit_url[0]
        who = self.scrape_participants(session, comit_url.attrib['href'])
        tds = event.xpath("./*")
        date = tds[0].text_content().strip()
        cttie = tds[1].text_content().strip()
        # Committee cell reads "<Chamber> - <Committee name>".
        cttie_chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
        info = tds[2]
        name = info.xpath("./a[contains(@href, 'raw')]")[0]
        notice = name.attrib['href']
        name = name.text
        time, where = info.xpath("./i/text()")
        what = tds[3].text_content()
        what = what.replace("Items: ", "")
        if "(None)" in what:
            continue  # agenda has no items
        what = [x.strip() for x in what.split(";")]
        # The calendar omits the year; assume the current year.
        when = ", ".join([date, str(dt.datetime.now().year), time])
        when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")
        event = Event(session, when, 'committee:meeting', name,
                      location=where, link=notice)
        event.add_source(calurl)
        event.add_participant('host', cttie, 'committee',
                              chamber=cttie_chamber)
        event.add_document("notice", notice, mimetype='application/pdf')
        for thing in who:
            event.add_participant(thing['title'], thing['name'],
                                  'legislator', chamber=cttie_chamber)
        self.save_event(event)
def scrape_house_weekly_schedule(self, session):
    """Scrape the LA House weekly schedule and save committee events."""
    url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
        try:
            guid = link.attrib['href']
        except KeyError:
            continue  # Sometimes we have a dead link. This is only on
                      # dead entries.
        committee = link.xpath("string(../../../td[1])").strip()
        when_and_where = link.xpath("string(../../../td[2])").strip()
        # Location is assumed to be the last comma-separated chunk of
        # the combined when/where cell — TODO confirm cell format.
        location = when_and_where.split(',')[-1]
        if when_and_where.strip() == "":
            continue
        year = datetime.datetime.now().year
        when = parse_datetime(when_and_where, year)  # We can only scrape
        # current year's events in LA.
        bills = self.scrape_bills(when_and_where)
        description = 'Committee Meeting: %s' % committee
        event = Event(session, when, 'committee:meeting',
                      description, location=location)
        event.add_source(url)
        event.add_participant('host', committee, 'committee',
                              chamber='lower')
        event.add_document("Agenda", guid, type='agenda',
                           mimetype="application/pdf")
        for bill in bills:
            event.add_related_bill(bill, description=when_and_where,
                                   type='consideration')
        event['link'] = guid
        self.save_event(event)
def test_event():
    """Event stores added documents (with default type 'other') and
    related bills exactly as given."""
    e = Event('S1', datetime.datetime(2012, 1, 1), 'meeting',
              'event description', 'event location')
    e.add_document('agenda', 'http://example.com/event/agenda.txt')
    e.add_related_bill('HB 1', relation='considered')
    # add_document defaults the document type to 'other'.
    assert_equal(e['documents'], [{
        'name': 'agenda',
        'url': 'http://example.com/event/agenda.txt',
        'type': 'other'
    }])
    assert_equal(e['related_bills'], [{
        'bill_id': 'HB 1',
        'relation': 'considered'
    }])
def scrape(self, session, chambers):
    """Scrape Alaska's meeting finder into committee-meeting events."""
    EVENTS_URL = 'http://www.akleg.gov/basis/Meeting/Find'
    events = self.lxmlize(EVENTS_URL).xpath(
        '//ul[@id="meetingResults"]/li')
    for info in events:
        event_url = info.xpath('span[@class="col04"]/a/@href')[0]
        doc = self.lxmlize(event_url)
        # Skip events that are placeholders or tentative
        # Also skip whole-chamber events
        if any(x.strip().startswith("No Meeting") for x in
               doc.xpath('//div[@class="schedule"]//text()')) \
                or "session" in \
                info.xpath('span[@class="col01"]/text()')[0].lower():
            continue
        event = Event(
            session=session,
            when=self._TZ.localize(datetime.datetime.strptime(
                info.xpath('span[@class="col02"]/text()')[0],
                self._DATETIME_FORMAT
            )),
            type='committee:meeting',
            # Flatten the schedule block's text into one description.
            description=" ".join(
                x.strip() for x in
                doc.xpath('//div[@class="schedule"]//text()')
                if x.strip()),
            location=doc.xpath(
                '//div[@class="heading-container"]/span/text()')
            [0].title()
        )
        event.add_participant(
            type='host',
            participant=info.xpath(
                'span[@class="col01"]/text()')[0].title(),
            participant_type='committee'
        )
        for document in doc.xpath('//td[@data-label="Document"]/a'):
            event.add_document(
                name=document.xpath('text()')[0],
                url=document.xpath('@href')[0]
            )
        event.add_source(EVENTS_URL)
        # Some event URLs contain literal spaces; escape them.
        event.add_source(event_url.replace(" ", "%20"))
        self.save_event(event)
def scrape_house_weekly_schedule(self, session):
    """Scrape the LA House weekly schedule and save committee events."""
    url = "http://house.louisiana.gov/H_Sched/Hse_Sched_Weekly.htm"
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    for link in page.xpath("//img[@alt = 'See Agenda in pdf']/.."):
        try:
            guid = link.attrib['href']
        except KeyError:
            continue  # Sometimes we have a dead link. This is only on
                      # dead entries.
        committee = link.xpath("string(../../td[1])").strip()
        when_and_where = link.xpath("string(../../td[2])").strip()
        # Location is assumed to be the last comma-separated chunk of
        # the combined when/where cell — TODO confirm cell format.
        location = when_and_where.split(',')[-1]
        if when_and_where.strip() == "":
            continue
        year = datetime.datetime.now().year
        when = parse_datetime(when_and_where, year)  # We can only scrape
        # current year's events in LA.
        bills = self.scrape_bills(when_and_where)
        description = 'Committee Meeting: %s' % committee
        event = Event(session, when, 'committee:meeting',
                      description, location=location)
        event.add_source(url)
        event.add_participant('host', committee, 'committee',
                              chamber='lower')
        event.add_document("Agenda", guid, type='agenda',
                           mimetype="application/pdf")
        for bill in bills:
            event.add_related_bill(bill, description=when_and_where,
                                   type='consideration')
        event['link'] = guid
        self.save_event(event)
def scrape(self, session, chambers):
    """Scrape Utah committee-meeting agendas from the Granicus RSS feed."""
    URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
    doc = self.lxmlize(URL)
    events = doc.xpath('//item')
    for info in events:
        # Titles look like "<committee> - <Mon DD, YYYY>".
        (title, when) = info.xpath('title/text()')[0].split(" - ")
        # Only keep items whose year matches the session's leading year.
        if not when.endswith(session[
                :len("20XX")]):
            continue
        event = Event(
            session=session,
            when=datetime.datetime.strptime(when, '%b %d, %Y'),
            type='committee:meeting',
            description=title,
            location='State Capitol'
        )
        event.add_source(URL)
        url = re.search(r'(http://.*?)\s', info.text_content()).group(1)
        doc = self.lxmlize(url)
        event.add_source(url)
        committee = doc.xpath('//a[text()="View committee page"]/@href')
        if committee:
            committee_doc = self.lxmlize(committee[0])
            committee_name = committee_doc.xpath(
                '//h3[@class="heading committee"]/text()')[0].strip()
            event.add_participant(
                type='host',
                participant=committee_name,
                participant_type='committee'
            )
        documents = doc.xpath('.//td')
        for document in documents:
            # Document URLs are embedded in the cell's onclick handler.
            event.add_document(
                name=document.xpath('text()')[0],
                url=re.search(r'(http://.*?pdf)',
                              document.xpath('@onclick')[0]).group(1),
                mimetype='application/pdf'
            )
        self.save_event(event)
def scrape(self, session, chambers):
    """Scrape Utah committee-meeting agendas from the Granicus RSS feed.

    Skips agenda pages that 404, attributes each event to its hosting
    committee's chamber, and attaches linked PDFs and related bills.
    """
    URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
    doc = self.lxmlize(URL)
    events = doc.xpath('//item')
    for info in events:
        # Titles look like "<committee> - <Mon DD, YYYY>".
        title_and_date = info.xpath('title/text()')[0].split(" - ")
        title = title_and_date[0]
        when = title_and_date[-1]
        # Only keep items whose year matches the session's leading year.
        if not when.endswith(session[:len("20XX")]):
            continue
        event = Event(
            session=session,
            when=datetime.datetime.strptime(when, '%b %d, %Y'),
            type='committee:meeting',
            description=title,
            location='State Capitol'
        )
        event.add_source(URL)
        url = re.search(r'(http://.*?)\s', info.text_content()).group(1)
        try:
            doc = self.lxmlize(url)
        except HTTPError:
            self.logger.warning("Page missing, skipping")
            continue
        event.add_source(url)
        committee = doc.xpath('//a[text()="View committee page"]/@href')
        if committee:
            committee_doc = self.lxmlize(committee[0])
            committee_name = committee_doc.xpath(
                '//h3[@class="heading committee"]/text()')[0].strip()
            # BUG FIX: the name is lower-cased before the comparison, so
            # the old checks against "Senate"/"House" could never match and
            # every committee was tagged "joint".  Compare lower to lower.
            if committee_name.lower().startswith("senate"):
                chamber = "upper"
            elif committee_name.lower().startswith("house"):
                chamber = "lower"
            else:
                chamber = "joint"
            event.add_participant(
                type='host',
                participant=committee_name,
                participant_type='committee',
                chamber=chamber
            )
        documents = doc.xpath('.//td')
        for document in documents:
            # Document URLs are embedded in the cell's onclick handler.
            url = re.search(r'(http://.*?pdf)',
                            document.xpath('@onclick')[0])
            if url is None:
                continue
            url = url.group(1)
            event.add_document(
                name=document.xpath('text()')[0],
                url=url,
                mimetype='application/pdf'
            )
            bills = document.xpath('@onclick')
            for bill in bills:
                # Static bill PDFs indicate a bill up for consideration.
                if "bills/static" in bill:
                    bill_name = bill.split("/")[-1].split(".")[0]
                    event.add_related_bill(
                        bill_name,
                        type='consideration',
                        description='Bill up for discussion')
        self.save_event(event)
def scrape_agenda(self, url, session):
    """Scrape a single RI committee agenda page into an Event."""
    page = self.lxmlize(url)
    # Get the date/time info:
    date_time = page.xpath("//table[@class='time_place']")
    if date_time == []:
        return  # no scheduling table on this page; nothing to scrape
    date_time = date_time[0]
    lines = date_time.xpath("./tr")
    metainf = {}
    for line in lines:
        tds = line.xpath("./td")
        # Rows are "DATE:" / "TIME:" / "PLACE:" label/value pairs.
        metainf[tds[0].text_content()] = tds[1].text_content()
    date = metainf['DATE:']
    time = metainf['TIME:']
    where = metainf['PLACE:']
    # The site is inconsistent about time formats; try each in turn.
    fmts = [
        "%A, %B %d, %Y",
        "%A, %B %d, %Y %I:%M %p",
        "%A, %B %d, %Y %I:%M",
    ]
    if time in all_day:
        datetime = date
    else:
        datetime = "%s %s" % (date, time)
    if "CANCELLED" in datetime or "Rise of the House" in datetime:
        # XXX: Do something more advanced.
        return
    # Normalize AM/PM spellings and strip status words before parsing.
    transtable = {
        "P.M": "PM",
        "PM.": "PM",
        "P.M.": "PM",
        "A.M.": "AM",
        "POSTPONED": "",
        "RESCHEDULED": "",
        "and Rise of the Senate": "",
    }
    for trans in transtable:
        datetime = datetime.replace(trans, transtable[trans])
    datetime = datetime.strip()
    for fmt in fmts:
        try:
            datetime = dt.datetime.strptime(datetime, fmt)
            break
        except ValueError:
            continue
    event = Event(session, datetime, 'committee:meeting',
                  'Meeting Notice', location=where)
    event.add_source(url)
    # aight. Let's get us some bills!
    bills = page.xpath("//b/a")
    for bill in bills:
        bill_ft = bill.attrib['href']
        event.add_document(bill.text_content(), bill_ft,
                           type="full-text", mimetype="application/pdf")
        # The bill id is scattered across sibling elements; join them.
        root = bill.xpath('../../*')
        root = [x.text_content() for x in root]
        bill_id = "".join(root)
        if "SCHEDULED FOR" in bill_id:
            continue
        descr = bill.getparent().getparent().getparent().getnext().getnext(
            ).text_content()
        # `replace` is a module-level mapping of id cleanups — see module.
        for thing in replace:
            bill_id = bill_id.replace(thing, replace[thing])
        event.add_related_bill(bill_id, description=descr,
                               type='consideration')
    committee = page.xpath("//span[@id='lblSession']")[0].text_content()
    chambers = {"house": "lower", "joint": "joint", "senate": "upper"}
    chamber = "other"
    for key in chambers:
        if key in committee.lower():
            chamber = chambers[key]
    event.add_participant("host", committee, 'committee', chamber=chamber)
    self.save_event(event)
def scrape(self, chamber, session):
    """Scrape Wyoming committee-meeting calendars for one chamber."""
    if chamber == 'other':
        return
    calendar_url = ("http://legisweb.state.wy.us/%s/Calendar/"
                    "CalendarMenu/CommitteeMenu.aspx" % str(session))
    page = self.get_page_from_url(calendar_url)
    rows = page.xpath('//table[@id="ctl00_cphContent_gvCalendars"]/tr')
    for i, row in enumerate(rows):
        # ASP.NET row control IDs are numbered starting at ctl02.
        row_ident = '%02d' % (i + 2)
        date_xpath = ('.//span[@id="ctl00_cphContent_gv'
                      'Calendars_ctl%s_lblDate"]' % str(row_ident))
        date_string = row.xpath(date_xpath)[0].text_content()
        chamber_char = self.metadata['chambers'][
            chamber]['name'][0].upper()
        meeting_xpath = ('.//a[@id="ctl00_cphContent_gv'
                         'Calendars_ctl%s_hl%scallink"]' % (
                             str(row_ident), chamber_char
                         ))
        meeting_url = row.xpath(meeting_xpath)
        if (len(meeting_url) == 1 and
                meeting_url[0].text_content().strip() != ''):
            meeting_url = meeting_url[0].attrib['href']
            meeting_page = self.get_page_from_url(meeting_url)
            meetings = meeting_page.xpath(
                './/table[@class="MsoNormalTable"]/tr')
            meeting_idents = []
            meeting_ident = 0
            # breaking the meetings into arrays (meeting_data) for
            # processing.  meeting_ident is the first row of the meeting
            # (time, committee, location)
            for meeting in meetings:
                if self.is_row_a_new_meeting(meeting):
                    meeting_idents.append(meeting_ident)
                meeting_ident += 1
            for i, meeting_ident in enumerate(meeting_idents):
                if len(meeting_idents) == 1 or i + 1 == len(meeting_idents):
                    # last (or only) meeting: take all remaining rows
                    ident_start, ident_end = [meeting_ident, 0]
                    meeting_data = meetings[ident_start:]
                else:
                    ident_start, ident_end = [
                        meeting_ident,
                        meeting_idents[i + 1] - 1
                    ]
                    if ident_end - ident_start == 1:
                        ident_end = ident_start + 2
                    meeting_data = meetings[ident_start:ident_end]
                committee = self.get_committee(meeting_data)
                meeting_time = self.get_meeting_time(meeting_data)
                meeting_date_time = datetime.datetime.strptime(
                    date_string + ' ' + meeting_time, '%m/%d/%Y %I:%M %p')
                meeting_date_time = self._tz.localize(meeting_date_time)
                location = self.get_location(meeting_data)
                description = self.get_meeting_description(meeting_data)
                bills = self.get_bills(meeting_data)
                if description == '':
                    description = committee
                event = Event(
                    session,
                    meeting_date_time,
                    'committee:meeting',
                    description,
                    location
                )
                event.add_source(meeting_url)
                for bill in bills:
                    if bill['bill_description'] == '':
                        bill['bill_description'] = committee
                    event.add_related_bill(
                        bill_id=bill['bill_id'],
                        description=bill['bill_description'],
                        type='consideration'
                    )
                    event.add_document(
                        name=bill['bill_id'],
                        url=bill['bill_url'],
                        type='bill',
                        mimetype='application/pdf'
                    )
                event.add_participant(
                    type='host',
                    participant=committee,
                    participant_type='committee',
                    chamber=chamber
                )
                self.save_event(event)
def scrape(self, chamber, session):
    """Scrape Wyoming committee-meeting calendars for one chamber."""
    if chamber == 'other':
        return
    calendar_url = ("http://legisweb.state.wy.us/%s/Calendar/"
                    "CalendarMenu/CommitteeMenu.aspx" % str(session))
    page = self.lxmlize(calendar_url)
    rows = page.xpath('//table[@id="ctl00_cphContent_gvCalendars"]/tr')
    for i, row in enumerate(rows):
        # ASP.NET row control IDs are numbered starting at ctl02.
        row_ident = '%02d' % (i + 2)
        date_xpath = ('.//span[@id="ctl00_cphContent_gv'
                      'Calendars_ctl%s_lblDate"]' % str(row_ident))
        date_string = row.xpath(date_xpath)[0].text_content()
        chamber_char = self.metadata['chambers'][chamber]['name'][0].upper(
        )
        meeting_xpath = ('.//a[@id="ctl00_cphContent_gv'
                         'Calendars_ctl%s_hl%scallink"]' %
                         (str(row_ident), chamber_char))
        meeting_url = row.xpath(meeting_xpath)
        if (len(meeting_url) == 1 and
                meeting_url[0].text_content().strip() != ''):
            try:
                meeting_url = meeting_url[0].attrib['href']
            except KeyError:
                self.warning("Alleged meeting date has no URL: " +
                             meeting_url[0].text_content().strip())
                continue
            meeting_page = self.lxmlize(meeting_url)
            meetings = meeting_page.xpath(
                './/table[@class="MsoNormalTable"]/tr')
            meeting_idents = []
            meeting_ident = 0
            # breaking the meetings into arrays (meeting_data) for
            # processing.  meeting_ident is the first row of the meeting
            # (time, committee, location)
            for meeting in meetings:
                if self.is_row_a_new_meeting(meeting):
                    meeting_idents.append(meeting_ident)
                meeting_ident += 1
            for i, meeting_ident in enumerate(meeting_idents):
                if len(meeting_idents) == 1 or i + 1 == len(
                        meeting_idents):
                    # last (or only) meeting: take all remaining rows
                    ident_start, ident_end = [meeting_ident, 0]
                    meeting_data = meetings[ident_start:]
                else:
                    ident_start, ident_end = [
                        meeting_ident,
                        meeting_idents[i + 1] - 1
                    ]
                    if ident_end - ident_start == 1:
                        ident_end = ident_start + 2
                    meeting_data = meetings[ident_start:ident_end]
                committee = self.get_committee(meeting_data)
                meeting_time = self.get_meeting_time(meeting_data)
                meeting_date_time = datetime.datetime.strptime(
                    date_string + ' ' + meeting_time,
                    '%m/%d/%Y %I:%M %p')
                meeting_date_time = self._tz.localize(meeting_date_time)
                location = self.get_location(meeting_data)
                description = self.get_meeting_description(meeting_data)
                bills = self.get_bills(meeting_data)
                if description == '':
                    description = committee
                event = Event(session, meeting_date_time,
                              'committee:meeting', description, location)
                event.add_source(meeting_url)
                for bill in bills:
                    if bill['bill_description'] == '':
                        bill['bill_description'] = committee
                    event.add_related_bill(
                        bill_id=bill['bill_id'],
                        description=bill['bill_description'],
                        type='consideration')
                    event.add_document(name=bill['bill_id'],
                                       url=bill['bill_url'],
                                       type='bill',
                                       mimetype='application/pdf')
                event.add_participant(type='host',
                                      participant=committee,
                                      participant_type='committee',
                                      chamber=chamber)
                self.save_event(event)
def scrape(self, chamber, session):
    """Scrape Wyoming committee-meeting calendars for one chamber."""
    if chamber == "other":
        return
    calendar_url = (
        "http://legisweb.state.wy.us/%s/Calendar/"
        "CalendarMenu/CommitteeMenu.aspx" % str(session)
    )
    page = self.lxmlize(calendar_url)
    rows = page.xpath('//table[@id="ctl00_cphContent_gvCalendars"]/tr')
    for i, row in enumerate(rows):
        # ASP.NET row control IDs are numbered starting at ctl02.
        row_ident = "%02d" % (i + 2)
        date_xpath = (
            './/span[@id="ctl00_cphContent_gv'
            'Calendars_ctl%s_lblDate"]' % str(row_ident)
        )
        date_string = row.xpath(date_xpath)[0].text_content()
        chamber_char = self.metadata["chambers"][chamber]["name"][0].upper()
        meeting_xpath = (
            './/a[@id="ctl00_cphContent_gv'
            'Calendars_ctl%s_hl%scallink"]' % (
                str(row_ident),
                chamber_char,
            )
        )
        meeting_url = row.xpath(meeting_xpath)
        if len(meeting_url) == 1 and meeting_url[0].text_content().strip() != "":
            try:
                meeting_url = meeting_url[0].attrib["href"]
            except KeyError:
                self.warning("Alleged meeting date has no URL: " +
                             meeting_url[0].text_content().strip())
                continue
            meeting_page = self.lxmlize(meeting_url)
            meetings = meeting_page.xpath('.//table[@class="MsoNormalTable"]/tr')
            meeting_idents = []
            meeting_ident = 0
            # breaking the meetings into arrays (meeting_data) for
            # processing.  meeting_ident is the first row of the meeting
            # (time, committee, location)
            for meeting in meetings:
                if self.is_row_a_new_meeting(meeting):
                    meeting_idents.append(meeting_ident)
                meeting_ident += 1
            for i, meeting_ident in enumerate(meeting_idents):
                if len(meeting_idents) == 1 or i + 1 == len(meeting_idents):
                    # last (or only) meeting: take all remaining rows
                    ident_start, ident_end = [meeting_ident, 0]
                    meeting_data = meetings[ident_start:]
                else:
                    ident_start, ident_end = [meeting_ident,
                                              meeting_idents[i + 1] - 1]
                    if ident_end - ident_start == 1:
                        ident_end = ident_start + 2
                    meeting_data = meetings[ident_start:ident_end]
                committee = self.get_committee(meeting_data)
                meeting_time = self.get_meeting_time(meeting_data)
                meeting_date_time = datetime.datetime.strptime(
                    date_string + " " + meeting_time, "%m/%d/%Y %I:%M %p"
                )
                meeting_date_time = self._tz.localize(meeting_date_time)
                location = self.get_location(meeting_data)
                description = self.get_meeting_description(meeting_data)
                bills = self.get_bills(meeting_data)
                if description == "":
                    description = committee
                event = Event(session, meeting_date_time, "committee:meeting",
                              description, location)
                event.add_source(meeting_url)
                for bill in bills:
                    if bill["bill_description"] == "":
                        bill["bill_description"] = committee
                    event.add_related_bill(
                        bill_id=bill["bill_id"],
                        description=bill["bill_description"],
                        type="consideration"
                    )
                    event.add_document(
                        name=bill["bill_id"], url=bill["bill_url"],
                        type="bill", mimetype="application/pdf"
                    )
                event.add_participant(
                    type="host", participant=committee,
                    participant_type="committee", chamber=chamber
                )
                self.save_event(event)
def scrape(self, session, chambers):
    """Scrape BC Hansard index pages into 'cow:meeting' events.

    One event is created per sitting (morning/afternoon) with links to
    the HTML and PDF transcripts; each HTML transcript is then scraped
    for speeches via ``self.scrape_hansard``.
    """
    hansard_urls = {
        '39th1st': 'http://www.leg.bc.ca/hansard/39th1st/index.htm',
        '39th2nd': 'http://www.leg.bc.ca/hansard/39th2nd/index.htm',
        '39th3rd': 'http://www.leg.bc.ca/hansard/39th3rd/index.htm',
        '39th4th': 'http://www.leg.bc.ca/hansard/8-8.htm',
    }
    url = hansard_urls[session]
    page = self.lxmlize(url)
    for row in page.xpath("//table/tr"):
        hansard_id = row.xpath(".//td[@align='left']")
        ids = row.xpath(".//td[@align='left']/p")
        web_links = row.xpath(".//a[contains(text(), 'HTML')]")
        pdf_links = row.xpath(".//a[contains(text(), 'PDF')]")
        if web_links == [] and pdf_links == []:
            continue
        if ids == []:
            continue
        if len(web_links) != 1:
            continue
            # XXX: Bug, deal with me! We sometimes get a ton
            # of unwanted hansard. Some of the xpath must be wrong.
        ids = ids[-1]
        date = ids.text.strip()
        hansard_id = ids.xpath(".//br")[0].tail
        hansard_id = re.sub("\s+", " ", hansard_id).strip()
        if date == "":
            continue
        times_of_day = ["Morning", "Afternoon"]
        time_of_day = None
        for time in times_of_day:
            if date.endswith(time):
                # BUG FIX: str.rstrip() strips a *set of characters*, not
                # a suffix, so date.rstrip(", %s" % time) could also eat
                # legitimate trailing characters.  Remove the exact
                # ", <time>" suffix instead.
                date = date[:-len(time)].rstrip(", ")
                time_of_day = time
        when = dt.datetime.strptime(date, "%A, %B %d, %Y")
        event = Event(
            session, when, 'cow:meeting',
            "%s session on %s" % (time_of_day, date)
            if time_of_day else "Session on %s" % (date),
            location='Parliament Buildings',
            record_id=hansard_id  # Official record's ID for speeches.
        )
        for x in web_links:
            event.add_document(x.text_content(), x.attrib['href'],
                               type="transcript", mimetype="text/html")
        for x in pdf_links:
            event.add_document(x.text_content(), x.attrib['href'],
                               type="transcript",
                               mimetype="application/pdf")
        event.add_source(url)
        # NOTE(review): sibling scrapers call self.save_event(); confirm
        # save_object is the intended API here.
        self.save_object(event)
        for a in web_links:
            self.scrape_hansard(session, 'lower', a.attrib['href'],
                                hansard_id)
def scrape_agenda(self, url, session): page = self.lxmlize(url) # Get the date/time info: date_time = page.xpath("//table[@class='time_place']") if date_time == []: return date_time = date_time[0] lines = date_time.xpath("./tr") metainf = {} for line in lines: tds = line.xpath("./td") metainf[tds[0].text_content()] = tds[1].text_content() date = metainf['DATE:'] time = metainf['TIME:'] where = metainf['PLACE:'] fmts = [ "%A, %B %d, %Y", "%A, %B %d, %Y %I:%M %p", "%A, %B %d, %Y %I:%M", ] if time in all_day: datetime = date else: datetime = "%s %s" % (date, time) if "CANCELLED" in datetime: # XXX: Do something more advanced. return transtable = { "P.M": "PM", "PM.": "PM", "P.M.": "PM", "A.M.": "AM", "POSTPONED": "", "RESCHEDULED": "", "and Rise of the Senate": "", } for trans in transtable: datetime = datetime.replace(trans, transtable[trans]) datetime = datetime.strip() for fmt in fmts: try: datetime = dt.datetime.strptime(datetime, fmt) break except ValueError: continue event = Event(session, datetime, 'committee:meeting', 'Meeting Notice', location=where) event.add_source(url) # aight. Let's get us some bills! bills = page.xpath("//b/a") for bill in bills: bill_ft = bill.attrib['href'] event.add_document(bill.text_content(), bill_ft, type="full-text", mimetype="application/pdf") root = bill.xpath('../../*') root = [x.text_content() for x in root] bill_id = "".join(root) if "SCHEDULED FOR" in bill_id: continue descr = bill.getparent().getparent().getparent().getnext().getnext( ).text_content() for thing in replace: bill_id = bill_id.replace(thing, replace[thing]) event.add_related_bill(bill_id, description=descr, type='consideration') committee = page.xpath("//span[@id='lblSession']")[0].text_content() chambers = {"house": "lower", "joint": "joint", "senate": "upper"} chamber = "other" for key in chambers: if key in committee.lower(): chamber = chambers[key] event.add_participant("host", committee, 'committee', chamber=chamber) self.save_event(event)