def scrape_meeting_notice(self, chamber, item, url):
    """Scrape one Delaware committee meeting notice into an Event.

    :param chamber: chamber identifier (unused here, kept for interface
        parity with the other scrape_* methods)
    :param item: JSON dict for one meeting from the listing endpoint
    :param url: listing URL, recorded as a source
    Yields a single Event with committee, agenda items and sponsors.
    """
    # Since Event Name is not provided for all meetings.
    event_name = str(item['CommitteeName'])
    # Sample timestamp: "04/25/2012 03:00:00 PM" — four-digit year and
    # seconds, so the format must use %Y and %S.  The previous
    # "%m/%d/%y %I:%M %p" could not parse it and raised ValueError.
    fmt = "%m/%d/%Y %I:%M:%S %p"
    start_time = dt.datetime.strptime(str(item['MeetingDateTime']), fmt)
    location_name = str(item['AddressAliasNickname'])
    event = Event(location_name=location_name,
                  start_date=self._tz.localize(start_time),
                  name=event_name,
                  description='Committee Meeting Status: {}'.format(
                      item['CommitteeMeetingStatusName']))
    event.add_source(url)
    event.add_committee(name=str(item['CommitteeName']),
                        id=item['CommitteeId'])
    page_url = ("http://legis.delaware.gov/json/MeetingNotice/"
                "GetCommitteeMeetingItems?committeeMeetingId={}".format(
                    item['CommitteeMeetingId']))
    event.add_source(page_url)
    page_data = self.post(page_url).json()['Data']
    # Loop variable renamed so it no longer shadows the ``item`` argument.
    for agenda_row in page_data:
        event.add_agenda_item(description=str(agenda_row['ItemDescription']))
        event.add_person(name=str(agenda_row['PrimarySponsorShortName']),
                         id=str(agenda_row['PrimarySponsorPersonId']),
                         note='Sponsor')
    yield event
def scrape_meeting_notice(self, chamber, item, url):
    """Build an Event for a single committee meeting notice (Delaware).

    :param chamber: chamber identifier (not used in the body)
    :param item: meeting dict from the JSON listing
    :param url: listing URL recorded as a source
    Yields one Event.
    """
    # Since Event Name is not provided for all meetings.
    event_name = str(item['CommitteeName'])
    # The feed supplies timestamps like "04/25/2012 03:00:00 PM", i.e.
    # a four-digit year with seconds.  "%m/%d/%y %I:%M %p" (the old
    # format) cannot parse that and raises ValueError, so use %Y/%S.
    fmt = "%m/%d/%Y %I:%M:%S %p"
    start_time = dt.datetime.strptime(str(item['MeetingDateTime']), fmt)
    location_name = str(item['AddressAliasNickname'])
    event = Event(location_name=location_name,
                  start_date=self._tz.localize(start_time),
                  name=event_name,
                  description='Committee Meeting Status: {}'
                  .format(item['CommitteeMeetingStatusName'])
                  )
    event.add_source(url)
    event.add_committee(name=str(item['CommitteeName']),
                        id=item['CommitteeId'])
    page_url = ("http://legis.delaware.gov/json/MeetingNotice/"
                "GetCommitteeMeetingItems?committeeMeetingId={}".format(
                    item['CommitteeMeetingId'])
                )
    event.add_source(page_url)
    page_data = self.post(page_url).json()['Data']
    # ``meeting_item`` avoids shadowing the ``item`` parameter above.
    for meeting_item in page_data:
        event.add_agenda_item(description=str(meeting_item['ItemDescription']))
        event.add_person(name=str(meeting_item['PrimarySponsorShortName']),
                         id=str(meeting_item['PrimarySponsorPersonId']),
                         note='Sponsor')
    yield event
def test_full_event():
    """Round-trip a fully-populated scraped event through the importer."""
    jurisdiction = Jurisdiction.objects.create(id='jid', division_id='did')
    scraped = ScrapeEvent(
        name="America's Birthday",
        start_time="2014-07-04",
        location="America",
        all_day=True,
    )
    scraped.add_person("George Washington")
    scraped.add_media_link("fireworks", "http://example.com/fireworks.mov")
    EventImporter('jid').import_data([scraped.as_dict()])
def ge():
    """Build and return a canned all-day ScrapeEvent fixture."""
    birthday = ScrapeEvent(
        name="America's Birthday",
        start_time="2014-07-04T05:00Z",
        location_name="America",
        timezone="America/New_York",
        all_day=True,
    )
    birthday.add_person("George Washington")
    return birthday
def ge():
    """Return a sample all-day event with one participant attached."""
    fixture = ScrapeEvent(name="America's Birthday",
                          start_time="2014-07-04T05:00Z",
                          location_name="America",
                          timezone="America/New_York",
                          all_day=True)
    fixture.add_person("George Washington")
    return fixture
def scrape(self):
    """Scrape committee hearings from the agenda calendar page.

    Yields Event objects carrying the host committee, the notice PDF,
    agenda items (with bill ids attached for AB/SB entries) and the
    committee participants.
    """
    page = self.lxmlize(calurl)
    events = page.xpath("//table[@class='agenda-body']//tr")[1:]
    for event in events:
        comit_url = event.xpath(
            ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")
        if len(comit_url) != 1:
            # Same exception type as before (caller-compatible), but
            # now with a message so failures are diagnosable.
            raise Exception(
                "expected exactly one committee link per row, "
                "found {}".format(len(comit_url)))
        comit_url = comit_url[0]
        who = self.scrape_participants(comit_url.attrib['href'])
        tds = event.xpath("./*")
        date = tds[0].text_content().strip()
        cttie = tds[1].text_content().strip()
        _chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
        info = tds[2]
        name = info.xpath("./a[contains(@href, 'raw')]")[0]
        notice = name.attrib['href']
        name = name.text
        time, where = info.xpath("./i/text()")
        what = tds[3].text_content()
        what = what.replace("Items: ", "")
        if "(None)" in what:
            continue
        what = [x.strip() for x in what.split(";")]
        # The listing omits the year, so assume the current one.
        when = ", ".join([date, str(dt.datetime.now().year), time])
        when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")
        event = Event(
            name=name,
            location_name=where,
            start_date=self._tz.localize(when),
        )
        event.add_source(calurl)
        event.add_committee(cttie, note='host')
        event.add_document("notice", notice, media_type='application/pdf')
        for entry in what:
            item = event.add_agenda_item(entry)
            if entry.startswith('AB') or entry.startswith('SB'):
                item.add_bill(entry)
        for thing in who:
            event.add_person(thing['name'])
        yield event
def scrape(self):
    """Yield committee hearing Events parsed from the agenda calendar."""
    doc = self.lxmlize(calurl)
    rows = doc.xpath("//table[@class='agenda-body']//tr")[1:]
    for row in rows:
        committee_links = row.xpath(
            ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")
        if len(committee_links) != 1:
            raise Exception
        participants = self.scrape_participants(
            committee_links[0].attrib['href'])
        cells = row.xpath("./*")
        date_text = cells[0].text_content().strip()
        committee_text = cells[1].text_content().strip()
        _chamber, committee_text = [
            part.strip() for part in committee_text.split(" - ", 1)]
        info_cell = cells[2]
        notice_link = info_cell.xpath("./a[contains(@href, 'raw')]")[0]
        notice_url = notice_link.attrib['href']
        title = notice_link.text
        time_text, venue = info_cell.xpath("./i/text()")
        items_text = cells[3].text_content().replace("Items: ", "")
        if "(None)" in items_text:
            continue
        agenda_entries = [part.strip() for part in items_text.split(";")]
        timestamp = ", ".join(
            [date_text, str(dt.datetime.now().year), time_text])
        start = dt.datetime.strptime(timestamp, "%a %b %d, %Y, %I:%M %p")
        hearing = Event(
            name=title,
            location_name=venue,
            start_date=self._tz.localize(start),
        )
        hearing.add_source(calurl)
        hearing.add_committee(committee_text, note='host')
        hearing.add_document("notice", notice_url,
                             media_type='application/pdf')
        for entry in agenda_entries:
            agenda_item = hearing.add_agenda_item(entry)
            if entry.startswith(('AB', 'SB')):
                agenda_item.add_bill(entry)
        for participant in participants:
            hearing.add_person(participant['name'])
        yield hearing
def scrape_chamber(self, chamber):
    """Yield committee-meeting Events for one chamber from the CA db."""
    hearings_by_slot = defaultdict(list)
    for hearing in self.session.query(CACommitteeHearing):
        location = (self.session.query(CALocation).filter_by(
            location_code=hearing.location_code)[0].description)
        date = self._tz.localize(hearing.hearing_date)
        # The location's first three characters encode the chamber.
        event_chamber = {"Asm": "lower", "Sen": "upper"}[location[0:3]]
        if event_chamber != chamber:
            continue
        hearings_by_slot[(location, date)].append(hearing)
    for (location, date), hearings in hearings_by_slot.items():
        # Split each raw bill id into its letter prefix and number.
        bills = [
            "%s %s" % re.match(r"\d+([^\d]+)(\d+)", hearing.bill_id).groups()
            for hearing in hearings
        ]
        # All hearings in one (location, date) slot must belong to the
        # same committee.
        msg = "More than one committee meeting at (location, date) %r"
        msg = msg % ((location, date), )
        assert len(set(hearing.committee_nr for hearing in hearings)) == 1, msg
        committee_name = _committee_nr[hearings.pop().committee_nr]
        event = Event(name="Committee Meeting: " + committee_name,
                      start_date=date,
                      location_name=committee_name)
        for bill_id in bills:
            item = event.add_agenda_item("consideration")
            item.add_bill(bill_id,
                          note="bill" if "B" in bill_id else "resolution")
        event.add_person(committee_name + " Committee", note="host")
        event.add_source("https://downloads.leginfo.legislature.ca.gov/")
        yield event
def scrape_chamber(self, chamber):
    """Yield committee meeting events for the requested chamber."""
    grouped = defaultdict(list)
    for hearing in self.session.query(CACommitteeHearing):
        location = self.session.query(CALocation).filter_by(
            location_code=hearing.location_code)[0].description
        date = self._tz.localize(hearing.hearing_date)
        abbr = location[0:3]
        if {'Asm': 'lower', 'Sen': 'upper'}[abbr] != chamber:
            continue
        grouped[(location, date)].append(hearing)
    for (location, date), hearings in grouped.items():
        # Split the numeric prefix off each bill id to get "<type> <nr>".
        bills = []
        for hearing in hearings:
            prefix, number = re.match(r'\d+([^\d]+)(\d+)',
                                      hearing.bill_id).groups()
            bills.append('%s %s' % (prefix, number))
        msg = 'More than one committee meeting at (location, date) %r'
        msg = msg % ((location, date),)
        assert len(set(h.committee_nr for h in hearings)) == 1, msg
        committee_name = _committee_nr[hearings.pop().committee_nr]
        desc = 'Committee Meeting: ' + committee_name
        event = Event(
            name=desc,
            start_date=date,
            location_name=committee_name,
        )
        for bill_id in bills:
            if 'B' in bill_id:
                type_ = 'bill'
            else:
                type_ = 'resolution'
            item = event.add_agenda_item('consideration')
            item.add_bill(bill_id, note=type_)
        event.add_person(committee_name + ' Committee', note='host')
        event.add_source('ftp://www.leginfo.ca.gov/pub/bill/')
        yield event
def get_events(self):
    """Scrape Toronto council meeting schedules and attendance.

    Downloads per-member attendance CSVs into a temp directory, then
    the meeting-schedule CSV, and yields one Event per meeting row
    with attendees and matching agenda items attached.

    Report endpoints used below:
      .../getAdminReport.do?function=prepareMeetingScheduleReport
      .../getAdminReport.do?function=prepareMemberAttendanceReport
    """
    # scrape attendance
    tmpdir = tempfile.mkdtemp()
    page = self.lxmlize("http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMemberAttendanceReport")
    members = page.xpath('//td[@class="inputText"]/select[@name="memberId"]/option')
    for member in members:
        post = {
            "function": "getMemberAttendanceReport",
            "download": "csv",
            "exportPublishReportId": 1,
            "termId": 4,
            "memberId": member.attrib["value"],
            "decisionBodyId": 0,
        }
        r = self.post("http://app.toronto.ca/tmmis/getAdminReport.do", data=post)
        if r.headers["content-type"] != "application/vnd.ms-excel":
            continue
        # Context manager guarantees the handle is closed on error.
        with open(tmpdir + "/" + member.text + ".csv", "w") as attendance_file:
            attendance_file.write(r.text)
    # scrape events
    post = {
        "function": "getMeetingScheduleReport",
        "download": "csv",
        "exportPublishReportId": 3,
        "termId": 4,
        "decisionBodyId": 0,
    }
    r = self.post("http://app.toronto.ca/tmmis/getAdminReport.do", data=post)
    empty = []
    with open("meetings.csv", "w") as meeting_file:
        meeting_file.write(r.text)
    # csv.reader requires a text-mode handle (the old "rb" mode handed
    # it bytes and fails on Python 3); newline="" is the documented way
    # to open files for the csv module.
    with open("meetings.csv", newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=",")
        next(reader)  # skip the header row
        committee = ""
        agenda_items = []
        for row in reader:
            name = row[0]
            when = dt.datetime.strptime(row[2], "%Y-%m-%d")
            location = row[5]
            # Agenda items only need re-fetching when the committee changes.
            if name != committee:
                committee = name
                agenda_items = find_items(committee)
            e = Event(name=name, session=self.session, when=when, location=location)
            # Call find_attendees once and reuse the result (the original
            # fetched it twice per row).
            attendees = find_attendees(tmpdir, row)
            if len(attendees) == 0:
                empty.append(row)
            for attendee in attendees:
                e.add_person(attendee)
            e.add_source("http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMeetingScheduleReport")
            for item in agenda_items:
                if item["date"].date() == when.date():
                    i = e.add_agenda_item(item["description"])
                    i.add_committee(committee)
                    i["order"] = item["order"]
                    for link in item["links"]:
                        i.add_media_link(link["name"], link["url"],
                                         on_duplicate="ignore")
                    if "notes" in item:
                        i["notes"] = [item["notes"]]
            yield e
    shutil.rmtree(tmpdir)
    os.remove("meetings.csv")
def get_events(self):
    """Yield Toronto council meeting Events with attendance attached.

    Report endpoints used below:
      .../getAdminReport.do?function=prepareMeetingScheduleReport
      .../getAdminReport.do?function=prepareMemberAttendanceReport
    """
    # scrape attendance
    tmpdir = tempfile.mkdtemp()
    page = self.lxmlize(
        "http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMemberAttendanceReport"
    )
    members = page.xpath(
        '//td[@class="inputText"]/select[@name="memberId"]/option')
    for member in members:
        post = {
            'function': 'getMemberAttendanceReport',
            'download': 'csv',
            'exportPublishReportId': 1,
            'termId': 4,
            'memberId': member.attrib['value'],
            'decisionBodyId': 0,
        }
        r = self.post("http://app.toronto.ca/tmmis/getAdminReport.do", data=post)
        if r.headers['content-type'] != 'application/vnd.ms-excel':
            continue
        # ``with`` closes the file even if the write raises.
        with open(tmpdir + '/' + member.text + '.csv', 'w') as attendance_file:
            attendance_file.write(r.text)
    # scrape events
    post = {
        'function': 'getMeetingScheduleReport',
        'download': 'csv',
        'exportPublishReportId': 3,
        'termId': 4,
        'decisionBodyId': 0,
    }
    r = self.post("http://app.toronto.ca/tmmis/getAdminReport.do", data=post)
    empty = []
    with open('meetings.csv', 'w') as meeting_file:
        meeting_file.write(r.text)
    # Text mode with newline='' is what the csv module documents; the
    # previous 'rb' mode passed bytes to csv.reader, which fails on
    # Python 3.
    with open('meetings.csv', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader)  # header row
        committee = ''
        agenda_items = []
        for row in reader:
            name = row[0]
            when = dt.datetime.strptime(row[2], "%Y-%m-%d")
            location = row[5]
            # Rows are grouped by committee; refresh items on change.
            if name != committee:
                committee = name
                agenda_items = find_items(committee)
            e = Event(name=name, session=self.session, when=when,
                      location=location)
            # Single find_attendees call per row (was called twice).
            attendees = find_attendees(tmpdir, row)
            if len(attendees) == 0:
                empty.append(row)
            for attendee in attendees:
                e.add_person(attendee)
            e.add_source(
                "http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMeetingScheduleReport"
            )
            for item in agenda_items:
                if item['date'].date() == when.date():
                    i = e.add_agenda_item(item['description'])
                    i.add_committee(committee)
                    i['order'] = item['order']
                    for link in item['links']:
                        i.add_media_link(link['name'], link['url'],
                                         on_duplicate='ignore')
                    if 'notes' in item:
                        i['notes'] = [item['notes']]
            yield e
    shutil.rmtree(tmpdir)
    os.remove('meetings.csv')
def scrape(self):
    """Yield Events for Minnesota Senate calendar entries.

    Parses each entry's title/link, optional info block (room, chairs,
    agenda text) and date, then emits an Event per dated entry.
    Development-time ``print``/``ppr`` debug calls were removed; the
    parsing logic is otherwise unchanged.
    """
    for c in senate_base:
        m = {}
        m['notice'] = c.xpath('.//p/span[@class="cal_special"]/text()')
        # Default so entries without an info block don't KeyError below.
        m['chair'] = []
        link = c.xpath('.//h3/a/@href')
        if len(link) > 0:
            m['link'] = c.xpath('.//h3/a/@href')[0]
            m['title'] = c.xpath('.//h3/a/text()')[0]
        else:
            # Unlinked entries fall back to the general calendar URL.
            m['link'] = 'https://www.leg.state.mn.us/cal?type=all'
            m['title'] = c.xpath('.//h3/text()')[0]
        info_div = c.xpath('.//div[@class="calendar_p_indent"]')
        if len(info_div) > 0:
            info_div = info_div[0]
            info_list = info_div.xpath('.//text()')
            nchairs = []
            agenda = False
            for il in info_list:
                il = il.replace('\xa0', '')
                if il.startswith(' and '):
                    il = il.replace(' and ', '')
                if il.startswith('Room'):
                    m['room'] = il
                if il.startswith('Rep.') or il.startswith('Sen.'):
                    cname = pull_middle_name(il[4:])
                    nchairs.append(cname.strip())
                # ``agenda`` flips on at the marker line below, so each
                # later text node overwrites m['agenda'] and the last
                # one wins — matching the original behaviour.
                if agenda:
                    m['agenda'] = il
                if il == 'Agenda: ':
                    agenda = True
            m['chair'] = nchairs
        if len(m['notice']) > 0:
            m['notice'] = m['notice'][0]
        else:
            m['notice'] = 'N/A'
        date = c.xpath('.//p/span/text()')
        if len(date) < 1:
            # No date on the entry: nothing to schedule, skip it.
            continue
        # Some entries list alternatives ("X or Y"); take the first.
        if 'or' in date[0]:
            date[0] = date[0].split('or')[0]
        m['date'] = datetime.datetime.strptime(
            date[0].replace('\xa0', ''), format1)
        if 'room' not in m:
            # Entries without a Room line appear to be floor sessions —
            # TODO confirm against the live calendar.
            m['room'] = 'Senate in session'
        event = Event(name=m['title'],
                      start_date=tz.localize(m['date']),
                      location_name=m['room'])
        event.add_committee(m['title'])
        event.add_source(m['link'])
        for chair in m['chair']:
            event.add_person(name=chair, note="Chair")
        yield event
def scrape(self):
    """Yield Events for Minnesota committee calendar entries.

    The three near-identical chair-parsing passes of the original are
    consolidated into one local helper, the always-true
    ``if info_div is not None`` check is replaced with a real emptiness
    guard, and debug prints were removed.
    """

    def _parse_chairs(raw):
        """Return cleaned chair names from a 'Rep./Sen. ...' string.

        Handles a single name or a comma-separated list; stripping each
        part first also keeps names after ", Rep. ..." that the inline
        version dropped because of the leading space.
        """
        names = []
        for part in raw.replace('\xa0', '').split(','):
            part = part.strip()
            if part.startswith('Rep.') or part.startswith('Sen.'):
                names.append(pull_middle_name(part[4:]).strip())
        return names

    for c in comm_base:
        # Defaults prevent KeyErrors when the info block is missing or
        # incomplete ('n/a' matches the house scraper's convention).
        m = {'room': 'n/a', 'chair': []}
        m['notice'] = c.xpath('.//p/span[@class="cal_special"]/text()')
        title = c.xpath('.//h3/a/text()')
        if len(title) == 0:
            continue
        m['title'] = title[0]
        m['link'] = c.xpath('.//h3/a/@href')[0]
        info_divs = c.xpath('.//div[@class="calendar_p_indent"]')
        if len(info_divs) > 0:
            # Positional layout: "Room: ", value, "Chair: ", value, ...
            # with the Chair label at index 1 or 2 depending on whether
            # a room is listed.
            info_list = info_divs[0].xpath('.//text()')
            if len(info_list) > 1 and info_list[0] == 'Room: ':
                m['room'] = info_list[1]
            if len(info_list) > 2 and info_list[1] == 'Chair: ':
                m['chair'] = _parse_chairs(info_list[2])
            if len(info_list) > 3 and info_list[2] == 'Chair: ':
                m['chair'] = _parse_chairs(info_list[3])
            if len(info_list) > 5 and info_list[4] == 'Agenda: ':
                m['agenda'] = info_list[5]
        if len(m['notice']) > 0:
            m['notice'] = m['notice'][0]
        else:
            m['notice'] = 'N/A'
        date = c.xpath('.//p/b/text()')
        if len(date) < 1:
            # Undated entry — skip it.
            continue
        m['date'] = datetime.datetime.strptime(date[0], format1)
        event = Event(name=m['title'],
                      start_date=tz.localize(m['date']),
                      location_name=m['room'])
        event.add_committee(m['title'])
        event.add_source(m['link'])
        for chair in m['chair']:
            event.add_person(name=chair, note="Chair")
        yield event
def scrape(self):
    """Yield Events for Minnesota House calendar entries.

    Parses each entry's committee/title/link, optional info block
    (room, chairs) and bill table, then emits one Event per dated
    entry.  Debug prints were removed, and room/chair/committee now
    have defaults so entries without an info block no longer KeyError.
    """
    for c in house_base:
        # Defaults for entries that lack the info block entirely.
        m = {'room': 'n/a', 'chair': None, 'link': None}
        m['notice'] = c.xpath('.//p/span[@class="cal_special"]/text()')
        links = c.xpath('.//h3/a/@href')
        if len(links) > 0:
            m['cmt'] = c.xpath('.//h3/a/text()')[0]
            m['link'] = c.xpath('.//h3/a/@href')[0]
            title = c.xpath('.//h3/text()')[0]
            if title == 'Agenda:':
                m['title'] = c.xpath('.//h3/a/text()')[0]
            else:
                m['title'] = c.xpath('.//h3/text()')[0]
        else:
            m['title'] = c.xpath('.//h3/text()')[0]
            # No link means no separate committee name; fall back to
            # the title so add_committee below cannot KeyError.
            m['cmt'] = m['title']
        bills = []
        info_divs = c.xpath('.//*[@class="calendar_p_indent"]')
        if len(info_divs) > 0 and len(info_divs[0]) > 0:
            info_div = info_divs[0]
            info_list = info_div.xpath('.//text()')
            info_list = [x.replace('\n', '').strip() for x in info_list]
            info_list = [x for x in info_list if len(x) > 0]
            if info_list[0].startswith('Room:'):
                m['room'] = info_list[1]
            if len(info_list) > 2:
                if info_list[2].startswith('Chair:'):
                    chair = info_list[3]
                    if ',' in chair:
                        nchairs = []
                        for part in chair.replace('\xa0', '').split(','):
                            if part.startswith('Rep.') or part.startswith('Sen.'):
                                nchairs.append(pull_middle_name(part[4:]).strip())
                        m['chair'] = nchairs
                    elif chair.startswith('Rep.') or chair.startswith('Sen.'):
                        m['chair'] = [pull_middle_name(chair[4:].strip()).strip()]
            # Bill table: three cells per row -> id, author, summary.
            for brs in c.xpath('.//*/table[@class="cal_bills"]/tbody/tr'):
                cells = brs.xpath('.//td')
                if len(cells) == 3:
                    bills.append({
                        'bill': cells[0].xpath('.//text()')[0],
                        'author': cells[1].xpath('./text()')[0],
                        'summary': cells[2].xpath('./text()')[0],
                    })
        if len(m['notice']) > 0:
            m['notice'] = m['notice'][0]
        else:
            m['notice'] = 'N/A'
        date = c.xpath('.//p/b/text()')
        if len(date) < 1:
            # Undated entry — skip it.
            continue
        m['date'] = datetime.datetime.strptime(date[0], format1)
        # NOTE(review): this literal was split across source lines in
        # the original; a single space is assumed — confirm against the
        # live page.
        if 'House Meets in Session' in m['title']:
            m['room'] = 'State leg'
            m['cmt'] = 'Minnesota House of Representatives'
            m['chair'] = None
            m['link'] = 'https://www.leg.state.mn.us/cal?type=all'
        event = Event(name=m['title'],
                      start_date=tz.localize(m['date']),
                      location_name=m['room'])
        for bill in bills:
            agenda_item = event.add_agenda_item(description=bill['summary'])
            agenda_item.add_bill(bill['bill'].replace('HF', 'HF '))
        event.add_committee(m['cmt'])
        if m['link'] is not None:
            event.add_source(m['link'])
        if m['chair'] is not None:
            for chair in m['chair']:
                event.add_person(name=chair, note="Chair")
        yield event