Esempio n. 1
0
    def scrape_meeting_notice(self, chamber, item, url):
        """Build an Event from one committee-meeting JSON record.

        :param chamber: chamber identifier (unused here, kept for the caller).
        :param item: dict for one meeting from the MeetingNotice JSON feed.
        :param url: source URL of the listing, attached to the event.
        :yields: a populated Event with committee, agenda items and sponsors.
        """
        # Since Event Name is not provided for all meetings.
        event_name = str(item['CommitteeName'])
        # Sample feed value: 04/25/2012 03:00:00 PM -- a four-digit year and
        # a seconds field, so the format must use %Y and include %S.  The
        # original "%m/%d/%y %I:%M %p" raises ValueError on this data.
        fmt = "%m/%d/%Y %I:%M:%S %p"
        start_time = dt.datetime.strptime(str(item['MeetingDateTime']), fmt)
        location_name = str(item['AddressAliasNickname'])
        event = Event(location_name=location_name,
                      start_date=self._tz.localize(start_time),
                      name=event_name,
                      description='Committee Meeting Status: {}'.format(
                          item['CommitteeMeetingStatusName']))

        event.add_source(url)
        event.add_committee(name=str(item['CommitteeName']),
                            id=item['CommitteeId'])

        page_url = ("http://legis.delaware.gov/json/MeetingNotice/"
                    "GetCommitteeMeetingItems?committeeMeetingId={}".format(
                        item['CommitteeMeetingId']))

        event.add_source(page_url)
        page_data = self.post(page_url).json()['Data']
        # Distinct loop variable: the original shadowed the `item` parameter,
        # clobbering the meeting record for the rest of the method.
        for agenda_row in page_data:
            event.add_agenda_item(description=str(agenda_row['ItemDescription']))
            event.add_person(name=str(agenda_row['PrimarySponsorShortName']),
                             id=str(agenda_row['PrimarySponsorPersonId']),
                             note='Sponsor')

        yield event
Esempio n. 2
0
    def scrape_meeting_notice(self, chamber, item, url):
        """Build an Event from one committee-meeting JSON record.

        :param chamber: chamber identifier (unused here, kept for the caller).
        :param item: dict for one meeting from the MeetingNotice JSON feed.
        :param url: source URL of the listing, attached to the event.
        :yields: a populated Event with committee, agenda items and sponsors.
        """
        # Since Event Name is not provided for all meetings.
        event_name = str(item['CommitteeName'])
        # Sample feed value: 04/25/2012 03:00:00 PM -- four-digit year plus
        # seconds, so the format needs %Y and %S.  The original
        # "%m/%d/%y %I:%M %p" raises ValueError on such input.
        fmt = "%m/%d/%Y %I:%M:%S %p"
        start_time = dt.datetime.strptime(str(item['MeetingDateTime']), fmt)
        location_name = str(item['AddressAliasNickname'])
        event = Event(location_name=location_name,
                      start_date=self._tz.localize(start_time),
                      name=event_name,
                      description='Committee Meeting Status: {}'
                      .format(item['CommitteeMeetingStatusName'])
                      )

        event.add_source(url)
        event.add_committee(name=str(item['CommitteeName']), id=item['CommitteeId'])

        page_url = ("http://legis.delaware.gov/json/MeetingNotice/"
                    "GetCommitteeMeetingItems?committeeMeetingId={}".format(
                        item['CommitteeMeetingId'])
                    )

        event.add_source(page_url)
        page_data = self.post(page_url).json()['Data']
        # Renamed loop variable: the original shadowed the `item` parameter.
        for agenda_row in page_data:
            event.add_agenda_item(description=str(agenda_row['ItemDescription']))
            event.add_person(name=str(agenda_row['PrimarySponsorShortName']),
                             id=str(agenda_row['PrimarySponsorPersonId']),
                             note='Sponsor')

        yield event
Esempio n. 3
0
def test_full_event():
    """Round-trip a fully populated scraped event through the importer."""
    jurisdiction = Jurisdiction.objects.create(id='jid', division_id='did')
    birthday = ScrapeEvent(
        name="America's Birthday",
        start_time="2014-07-04",
        location="America",
        all_day=True,
    )
    birthday.add_person("George Washington")
    birthday.add_media_link("fireworks", "http://example.com/fireworks.mov")

    EventImporter('jid').import_data([birthday.as_dict()])
Esempio n. 4
0
def ge():
    """Return a canonical scraped-event fixture with one attendee."""
    fixture = ScrapeEvent(name="America's Birthday",
                          start_time="2014-07-04T05:00Z",
                          location_name="America",
                          timezone="America/New_York",
                          all_day=True)
    fixture.add_person("George Washington")
    return fixture
Esempio n. 5
0
def ge():
    """Build and return the standard all-day event fixture."""
    kwargs = {
        "name": "America's Birthday",
        "start_time": "2014-07-04T05:00Z",
        "location_name": "America",
        "timezone": "America/New_York",
        "all_day": True,
    }
    evt = ScrapeEvent(**kwargs)
    evt.add_person("George Washington")
    return evt
Esempio n. 6
0
    def scrape(self):
        """Yield an Event for each agenda-calendar row that lists items.

        Rows whose item list contains "(None)" are skipped; a row without
        exactly one committee-info link is treated as a scrape failure.
        """
        page = self.lxmlize(calurl)
        # First <tr> is the table header.
        events = page.xpath("//table[@class='agenda-body']//tr")[1:]

        for event in events:
            comit_url = event.xpath(
                ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")

            if len(comit_url) != 1:
                # Fail loudly with context instead of a bare `raise Exception`.
                raise Exception(
                    "expected exactly 1 committee link per row, found %d"
                    % len(comit_url))

            comit_url = comit_url[0]
            who = self.scrape_participants(comit_url.attrib['href'])

            tds = event.xpath("./*")
            date = tds[0].text_content().strip()
            cttie = tds[1].text_content().strip()
            _chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
            info = tds[2]
            name = info.xpath("./a[contains(@href, 'raw')]")[0]
            notice = name.attrib['href']
            name = name.text
            time, where = info.xpath("./i/text()")
            what = tds[3].text_content()
            what = what.replace("Items: ", "")
            if "(None)" in what:
                continue
            what = [x.strip() for x in what.split(";")]

            # The page omits the year; assume the current one.
            when = ", ".join([date, str(dt.datetime.now().year), time])
            when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

            event = Event(
                name=name,
                location_name=where,
                start_date=self._tz.localize(when),
            )

            event.add_source(calurl)

            event.add_committee(cttie, note='host')

            event.add_document("notice", notice, media_type='application/pdf')

            for entry in what:
                item = event.add_agenda_item(entry)
                # Only AB/SB entries are bill identifiers.
                if entry.startswith(('AB', 'SB')):
                    item.add_bill(entry)

            for thing in who:
                event.add_person(thing['name'])

            yield event
Esempio n. 7
0
    def scrape(self):
        """Yield an Event for each agenda-calendar row that has agenda items."""
        page = self.lxmlize(calurl)
        rows = page.xpath("//table[@class='agenda-body']//tr")[1:]

        for row in rows:
            committee_links = row.xpath(
                ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")
            if len(committee_links) != 1:
                raise Exception

            participants = self.scrape_participants(
                committee_links[0].attrib['href'])

            cells = row.xpath("./*")
            date = cells[0].text_content().strip()
            chamber_and_cttie = cells[1].text_content().strip()
            _chamber, cttie = [part.strip()
                               for part in chamber_and_cttie.split(" - ", 1)]
            info = cells[2]
            notice_link = info.xpath("./a[contains(@href, 'raw')]")[0]
            notice = notice_link.attrib['href']
            title = notice_link.text
            time, where = info.xpath("./i/text()")
            items_text = cells[3].text_content().replace("Items: ", "")
            if "(None)" in items_text:
                continue
            agenda_entries = [entry.strip() for entry in items_text.split(";")]

            # Date column lacks a year; join in the current one before parsing.
            when_text = ", ".join([date, str(dt.datetime.now().year), time])
            when = dt.datetime.strptime(when_text, "%a %b %d, %Y, %I:%M %p")

            event = Event(
                name=title,
                location_name=where,
                start_date=self._tz.localize(when),
            )
            event.add_source(calurl)
            event.add_committee(cttie, note='host')
            event.add_document("notice", notice, media_type='application/pdf')

            for entry in agenda_entries:
                item = event.add_agenda_item(entry)
                if entry.startswith('AB') or entry.startswith('SB'):
                    item.add_bill(entry)

            for participant in participants:
                event.add_person(participant['name'])

            yield event
Esempio n. 8
0
    def scrape_chamber(self, chamber):
        """Yield committee-hearing Events for one chamber ('upper'/'lower')."""
        hearings_by_slot = defaultdict(list)

        for hearing in self.session.query(CACommitteeHearing):
            location = (self.session.query(CALocation).filter_by(
                location_code=hearing.location_code)[0].description)
            date = self._tz.localize(hearing.hearing_date)

            # The first three characters of the location name encode the
            # chamber ("Asm" or "Sen").
            hearing_chamber = {"Asm": "lower", "Sen": "upper"}[location[0:3]]
            if hearing_chamber != chamber:
                continue

            hearings_by_slot[(location, date)].append(hearing)

        for (location, date), hearings in hearings_by_slot.items():

            # "%s %s" joins the alpha and numeric groups of each raw bill id.
            bill_ids = [h.bill_id for h in hearings]
            bills = ["%s %s" % re.match(r"\d+([^\d]+)(\d+)", raw).groups()
                     for raw in bill_ids]

            # All hearings in one (location, date) slot must share a committee.
            msg = "More than one committee meeting at (location, date) %r"
            msg = msg % ((location, date), )
            committee_nrs = set(h.committee_nr for h in hearings)
            assert len(committee_nrs) == 1, msg
            committee_name = _committee_nr[hearings.pop().committee_nr]

            event = Event(name="Committee Meeting: " + committee_name,
                          start_date=date,
                          location_name=committee_name)
            for bill_id in bills:
                item = event.add_agenda_item("consideration")
                item.add_bill(bill_id,
                              note="bill" if "B" in bill_id else "resolution")

            event.add_person(committee_name + " Committee", note="host")
            event.add_source("https://downloads.leginfo.legislature.ca.gov/")

            yield event
Esempio n. 9
0
    def scrape_chamber(self, chamber):
        """Yield committee-hearing Events for the requested chamber."""
        slots = defaultdict(list)

        for hearing in self.session.query(CACommitteeHearing):
            location = self.session.query(CALocation).filter_by(
                location_code=hearing.location_code)[0].description
            date = self._tz.localize(hearing.hearing_date)

            # Leading 'Asm'/'Sen' in the location maps to the chamber.
            if {'Asm': 'lower', 'Sen': 'upper'}[location[0:3]] != chamber:
                continue

            slots[(location, date)].append(hearing)

        for (location, date), hearings in slots.items():

            # Split each raw bill id into its alpha and numeric parts.
            bills = ['%s %s' % re.match(r'\d+([^\d]+)(\d+)', raw).groups()
                     for raw in (h.bill_id for h in hearings)]

            msg = 'More than one committee meeting at (location, date) %r'
            msg = msg % ((location, date),)
            assert len(set(h.committee_nr for h in hearings)) == 1, msg
            committee_name = _committee_nr[hearings.pop().committee_nr]

            event = Event(
                name='Committee Meeting: ' + committee_name,
                start_date=date,
                location_name=committee_name,
            )
            for bill_id in bills:
                item = event.add_agenda_item('consideration')
                item.add_bill(bill_id,
                              note='bill' if 'B' in bill_id else 'resolution')

            event.add_person(committee_name + ' Committee', note='host')
            event.add_source('ftp://www.leginfo.ca.gov/pub/bill/')

            yield event
Esempio n. 10
0
    def get_events(self):
        """Scrape Toronto TMMIS attendance and meeting-schedule reports.

        Downloads one attendance CSV per member into a temp directory plus
        the overall meeting-schedule CSV, then yields an Event per meeting.
        Source reports (on http://app.toronto.ca/tmmis/getAdminReport.do):
        prepareMeetingScheduleReport and prepareMemberAttendanceReport.
        """
        # scrape attendance

        tmpdir = tempfile.mkdtemp()

        page = self.lxmlize("http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMemberAttendanceReport")
        members = page.xpath('//td[@class="inputText"]/select[@name="memberId"]/option')
        for member in members:
            post = {
                "function": "getMemberAttendanceReport",
                "download": "csv",
                "exportPublishReportId": 1,
                "termId": 4,
                "memberId": member.attrib["value"],
                "decisionBodyId": 0,
            }
            r = self.post("http://app.toronto.ca/tmmis/getAdminReport.do", data=post)
            if r.headers["content-type"] != "application/vnd.ms-excel":
                continue

            # `with` guarantees the handle is closed even if the write fails.
            with open(tmpdir + "/" + member.text + ".csv", "w") as attendance_file:
                attendance_file.write(r.text)

        # scrape events
        post = {
            "function": "getMeetingScheduleReport",
            "download": "csv",
            "exportPublishReportId": 3,
            "termId": 4,
            "decisionBodyId": 0,
        }

        r = self.post("http://app.toronto.ca/tmmis/getAdminReport.do", data=post)
        empty = []  # rows with no recorded attendees, kept for inspection

        with open("meetings.csv", "w") as meeting_file:
            meeting_file.write(r.text)
        # csv.reader needs a text-mode handle in Python 3 -- the original
        # opened the file "rb", which makes csv.reader raise on the first
        # row.  newline="" is the csv-module-documented open mode.
        with open("meetings.csv", newline="") as csvfile:
            reader = csv.reader(csvfile, delimiter=",")
            next(reader)  # skip the header row

            committee = ""
            agenda_items = []

            for row in reader:
                name = row[0]
                when = dt.datetime.strptime(row[2], "%Y-%m-%d")
                location = row[5]

                # Rows are grouped by committee; only refetch on change.
                if name != committee:
                    committee = name
                    agenda_items = find_items(committee)

                e = Event(name=name, session=self.session, when=when, location=location)

                # Call find_attendees once and reuse the result (the
                # original called it twice per row).
                attendees = find_attendees(tmpdir, row)
                if len(attendees) == 0:
                    empty.append(row)
                for attendee in attendees:
                    e.add_person(attendee)
                e.add_source("http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMeetingScheduleReport")

                for item in agenda_items:
                    if item["date"].date() == when.date():
                        i = e.add_agenda_item(item["description"])
                        i.add_committee(committee)
                        i["order"] = item["order"]

                        for link in item["links"]:
                            i.add_media_link(link["name"], link["url"], on_duplicate="ignore")

                        if "notes" in item:
                            i["notes"] = [item["notes"]]

                yield e

        shutil.rmtree(tmpdir)
        os.remove("meetings.csv")
Esempio n. 11
0
    def get_events(self):
        """Scrape Toronto TMMIS attendance and meeting-schedule reports.

        Writes one attendance CSV per member into a temp directory, then
        downloads the schedule CSV and yields an Event per meeting row.
        Source reports (on http://app.toronto.ca/tmmis/getAdminReport.do):
        prepareMeetingScheduleReport and prepareMemberAttendanceReport.
        """
        # scrape attendance

        tmpdir = tempfile.mkdtemp()

        page = self.lxmlize(
            "http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMemberAttendanceReport"
        )
        members = page.xpath(
            '//td[@class="inputText"]/select[@name="memberId"]/option')
        for member in members:
            post = {
                'function': 'getMemberAttendanceReport',
                'download': 'csv',
                'exportPublishReportId': 1,
                'termId': 4,
                'memberId': member.attrib['value'],
                'decisionBodyId': 0,
            }
            r = self.post("http://app.toronto.ca/tmmis/getAdminReport.do",
                          data=post)
            if r.headers['content-type'] != 'application/vnd.ms-excel':
                continue

            # Context manager closes the handle even on a failed write.
            with open(tmpdir + '/' + member.text + '.csv', 'w') as attendance_file:
                attendance_file.write(r.text)

        # scrape events
        post = {
            'function': 'getMeetingScheduleReport',
            'download': 'csv',
            'exportPublishReportId': 3,
            'termId': 4,
            'decisionBodyId': 0,
        }

        r = self.post("http://app.toronto.ca/tmmis/getAdminReport.do",
                      data=post)
        empty = []  # rows with no recorded attendees, kept for inspection

        with open('meetings.csv', 'w') as meeting_file:
            meeting_file.write(r.text)
        # csv.reader requires a text-mode handle in Python 3 (the original
        # 'rb' mode makes it raise on the first row); newline='' is the
        # open mode the csv module documents.
        with open('meetings.csv', newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            next(reader)  # skip header row

            committee = ''
            agenda_items = []

            for row in reader:
                name = row[0]
                when = dt.datetime.strptime(row[2], "%Y-%m-%d")
                location = row[5]

                # Rows are grouped by committee; only refetch on change.
                if name != committee:
                    committee = name
                    agenda_items = find_items(committee)

                e = Event(name=name,
                          session=self.session,
                          when=when,
                          location=location)

                # Single find_attendees call, reused below (original
                # called it twice per row).
                attendees = find_attendees(tmpdir, row)
                if len(attendees) == 0:
                    empty.append(row)
                for attendee in attendees:
                    e.add_person(attendee)
                e.add_source(
                    "http://app.toronto.ca/tmmis/getAdminReport.do?function=prepareMeetingScheduleReport"
                )

                for item in agenda_items:
                    if item['date'].date() == when.date():
                        i = e.add_agenda_item(item['description'])
                        i.add_committee(committee)
                        i['order'] = item['order']

                        for link in item['links']:
                            i.add_media_link(link['name'],
                                             link['url'],
                                             on_duplicate='ignore')

                        if 'notes' in item:
                            i['notes'] = [item['notes']]

                yield e

        shutil.rmtree(tmpdir)
        os.remove('meetings.csv')
Esempio n. 12
0
    def scrape(self):
        """Yield an Event for each Senate calendar entry in senate_base.

        Entries without a parseable date are skipped.  Debug prints are
        kept from the original scraper.
        """
        for c in senate_base:
            m = {}
            m['notice'] = c.xpath('.//p/span[@class="cal_special"]/text()')
            link = c.xpath('.//h3/a/@href')
            print('top link: ', c.xpath('.//h3/*'))
            if len(link) > 0:
                m['link'] = link[0]
                m['title'] = c.xpath('.//h3/a/text()')[0]
            else:
                # No committee link: fall back to the generic calendar URL.
                m['link'] = 'https://www.leg.state.mn.us/cal?type=all'
                m['title'] = c.xpath('.//h3/text()')[0]
            print('top link 2: ', c.xpath('.//h3/text()'))
            info_div = c.xpath('.//div[@class="calendar_p_indent"]')
            if len(info_div) > 0:
                info_div = info_div[0]
                info_list = info_div.xpath('.//text()')
                nchairs = []
                agenda = False
                for il in info_list:
                    il = il.replace('\xa0', '')
                    if il.startswith(' and '):
                        il = il.replace(' and ', '')
                    if il.startswith('Room'):
                        m['room'] = il
                    if il.startswith('Rep.') or il.startswith('Sen.'):
                        cname = pull_middle_name(il[4:])
                        nchairs.append(cname.strip())
                    # `agenda` flags that the previous node was the
                    # "Agenda: " label, so this node holds the agenda text.
                    if agenda:
                        m['agenda'] = il
                    if il == 'Agenda: ':
                        agenda = True
                m['chair'] = nchairs
            if len(m['notice']) > 0:
                m['notice'] = m['notice'][0]
            else:
                m['notice'] = 'N/A'
            ppr(m)
            date = c.xpath('.//p/span/text()')
            if len(date) < 1:
                print('\n\n\n\n NO DATE')
                ppr(m)
                continue
            # Some entries list alternatives ("X or Y"); keep the first.
            if 'or' in date[0]:
                date[0] = date[0].split('or')[0]
            m['date'] = datetime.datetime.strptime(date[0].replace('\xa0', ''),
                                                   format1)
            ppr(m)
            if 'room' not in m:
                print('oops')
                m['room'] = 'Senate in session'
            event = Event(name=m['title'],
                          start_date=tz.localize(m['date']),
                          location_name=m['room'])

            event.add_committee(m['title'])
            event.add_source(m['link'])
            # 'chair' is only set when an info div was present; default to
            # no chairs instead of raising KeyError (original bug).
            for chair in m.get('chair', []):
                event.add_person(name=chair, note="Chair")
            yield event
Esempio n. 13
0
    def scrape(self):
        """Yield an Event for each committee calendar entry in comm_base.

        Entries without a linked title or a parseable date are skipped.
        Debug prints are kept from the original scraper.
        """
        def parse_chairs(raw):
            # Extract chair names from a 'Rep./Sen.' string.  Returns a list
            # of cleaned names (possibly empty for a comma list with no
            # recognizable names), or None when nothing matched.
            if ',' in raw:
                names = []
                for part in raw.replace('\xa0', '').split(','):
                    if part.startswith('Rep.') or part.startswith('Sen.'):
                        names.append(pull_middle_name(part[4:]).strip())
                return names
            if raw.startswith('Rep.') or raw.startswith('Sen.'):
                return [pull_middle_name(raw[4:].strip()).strip()]
            return None

        for c in comm_base:
            print(c.xpath('.//h3/a/text()'))
        for c in comm_base:
            m = {}
            m['notice'] = c.xpath('.//p/span[@class="cal_special"]/text()')
            print(c.xpath('.//h3/*'))
            title = c.xpath('.//h3/a/text()')
            if len(title) == 0:
                continue
            m['title'] = title[0]
            m['link'] = c.xpath('.//h3/a/@href')[0]
            # Guard the [0] index: the original indexed unconditionally and
            # then tested `is not None`, which could never be False but the
            # indexing itself raised IndexError when the div was missing.
            info_divs = c.xpath('.//div[@class="calendar_p_indent"]')
            if info_divs:
                print('one info div')
                info_list = info_divs[0].xpath('.//text()')
                # Length guards prevent IndexError on short info lists.
                if len(info_list) > 1 and info_list[0] == 'Room: ':
                    m['room'] = info_list[1]
                # The chair label may sit at index 1 or 2 depending on
                # whether a room line precedes it.
                if len(info_list) > 2 and info_list[1] == 'Chair: ':
                    chairs = parse_chairs(info_list[2])
                    if chairs is not None:
                        m['chair'] = chairs
                if len(info_list) > 3 and info_list[2] == 'Chair: ':
                    chairs = parse_chairs(info_list[3])
                    if chairs is not None:
                        m['chair'] = chairs
                if len(info_list) > 5 and info_list[4] == 'Agenda: ':
                    m['agenda'] = info_list[5]

            if len(m['notice']) > 0:
                m['notice'] = m['notice'][0]
            else:
                m['notice'] = 'N/A'
            ppr(m)
            date = c.xpath('.//p/b/text()')
            if len(date) < 1:
                print('\n\n\n\n NO DATE')
                ppr(m)
                continue
            m['date'] = datetime.datetime.strptime(date[0], format1)

            # Default the room instead of raising KeyError when the info div
            # did not provide one (original bug).
            event = Event(name=m['title'],
                          start_date=tz.localize(m['date']),
                          location_name=m.get('room', 'n/a'))

            event.add_committee(m['title'])
            event.add_source(m['link'])
            # 'chair' is only set when chairs were parsed; default to none.
            for chair in m.get('chair', []):
                event.add_person(name=chair, note="Chair")
            yield event
Esempio n. 14
0
    def scrape(self):
        """Yield an Event for each House calendar entry in house_base.

        Entries without a parseable date are skipped.  Debug prints are
        kept from the original scraper.
        """
        for c in house_base:
            m = {}
            m['notice'] = c.xpath('.//p/span[@class="cal_special"]/text()')
            links = c.xpath('.//h3/a/@href')
            if len(links) > 0:
                m['cmt'] = c.xpath('.//h3/a/text()')[0]
                m['link'] = links[0]
                title = c.xpath('.//h3/text()')[0]
                # When the bare h3 text is just the "Agenda:" label, the
                # real title is the link text.
                if title == 'Agenda:':
                    m['title'] = c.xpath('.//h3/a/text()')[0]
                else:
                    m['title'] = title
            else:
                m['title'] = c.xpath('.//h3/text()')[0]
                m['link'] = None
            info_div = c.xpath('.//*[@class="calendar_p_indent"]')
            if len(info_div) > 0:
                info_div = info_div[0]
            print('Info Div: ', info_div)
            # len() of an element counts its children; of the empty list, 0.
            if len(info_div) > 0:
                info_list = info_div.xpath('.//text()')
                info_links = info_div.xpath('.//*/@href')
                print("info links: ", info_links)
                info_list = [x.replace('\n', '').strip() for x in info_list]
                info_list = [x for x in info_list if len(x) > 0]
                print('Info list: ', info_list)
                if info_list[0].startswith('Room:'):
                    m['room'] = info_list[1]
                else:
                    m['room'] = 'n/a'
                if len(info_list) > 2:
                    if info_list[2].startswith('Chair:'):
                        chair = info_list[3]
                        if ',' in chair:
                            nchairs = []
                            for part in chair.replace('\xa0', '').split(','):
                                if part.startswith('Rep.') or part.startswith('Sen.'):
                                    nchairs.append(pull_middle_name(part[4:]).strip())
                            m['chair'] = nchairs
                        elif chair.startswith('Rep.') or chair.startswith('Sen.'):
                            m['chair'] = [pull_middle_name(chair[4:].strip()).strip()]
                else:
                    m['chair'] = None

            bill_rows = c.xpath('.//*/table[@class="cal_bills"]/tbody/tr')
            print('Bills: ', bill_rows)
            bills = []
            for brs in bill_rows:
                cells = brs.xpath('.//td')
                if len(cells) == 3:
                    bills.append({
                        'bill': cells[0].xpath('.//text()')[0],
                        'author': cells[1].xpath('./text()')[0],
                        'summary': cells[2].xpath('./text()')[0],
                    })
            if len(m['notice']) > 0:
                m['notice'] = m['notice'][0]
            else:
                m['notice'] = 'N/A'
            date = c.xpath('.//p/b/text()')
            if len(date) < 1:
                print('\n\n\n\n NO DATE')
                continue
            m['date'] = datetime.datetime.strptime(date[0], format1)

            if 'House Meets in Session' in m['title']:
                m['room'] = 'State leg'
                m['cmt'] = 'Minnesota House of Representatives'
                m['chair'] = None
                m['link'] = 'https://www.leg.state.mn.us/cal?type=all'
            # Defaults below guard entries with no info div / no link: the
            # original raised KeyError on 'room', 'cmt', or 'chair' here.
            event = Event(name=m['title'],
                          start_date=tz.localize(m['date']),
                          location_name=m.get('room', 'n/a'))
            for bill in bills:
                agenda_item = event.add_agenda_item(description=bill['summary'])
                agenda_item.add_bill(bill['bill'].replace('HF', 'HF '))
            if 'cmt' in m:
                event.add_committee(m['cmt'])
            if m['link'] is not None:
                event.add_source(m['link'])
            # 'chair' may be missing or explicitly None; treat both as empty.
            for chair in m.get('chair') or []:
                event.add_person(name=chair, note="Chair")
            yield event