Code example #1
    def scrape(self):
        for page in self.eventPages(EVENTSPAGE):
            events_table = page.xpath("//table[@class='rgMasterTable']")[0]
            for events, headers, rows in self.parseDataTable(events_table):
                print(events)
                location_string = events[u'Meeting\xa0Location']
                location_list = location_string.split('--')
                location = ', '.join(location_list[0:2])

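                # the text after "Chicago, Illinois" in the final '--' segment
                # may carry a status suffix (e.g. "--Cancelled")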
                status_string = location_list[-1].split('Chicago, Illinois')
                if len(status_string) > 1 and status_string[1]:
                    status = status_string[1].lower()
                    if status not in ['cancelled', 'tentative', 'confirmed', 'passed']:
                        print(status)
                        status = 'confirmed'
                else:
                    status = 'confirmed'

                when = events[u'Meeting\xa0Date']
                time_string = events[u'Meeting\xa0Time']
                event_time = datetime.datetime.strptime(time_string,
                                                        "%I:%M %p")
                when = when.replace(hour=event_time.hour, minute=event_time.minute)

                e = Event(name=events["Name"]["label"],
                          when=when,
                          location=location,
                          status=status)
                e.add_source(EVENTSPAGE)
                if events['Video'] != u'Not\xa0available':
                    print(events['Video'])

                yield e
Code example #2
File: events.py Project: cliftonmcintosh/openstates
    def scrape_committee_events(self, code, name):
        events_url = \
                'http://www.cga.ct.gov/basin/fullcalendar/commevents.php?' \
                'comm_code={}'.format(code)
        events_data = self.get(events_url).text
        events = json.loads(events_data)

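        # the feed's start values are ISO 8601 UTC timestamps, e.g. 2017-01-05T10:00:00Z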
        DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
        for info in events:

            if info['title'] is None:
                self.warning("Event found with no title; it will be skipped")
                continue
            elif info['title'].startswith('CANCELLED:'):
                self.info("Cancelled event found; it will be skipped: {}".
                          format(info['title']))
                continue

            when = datetime.datetime.strptime(info['start'], DATETIME_FORMAT)
            # end = datetime.datetime.strptime(info['end'], DATETIME_FORMAT)
            where = "{0} {1}".format(info['building'].strip(), info['location'].strip())
            # end_time=self._tz.localize(end),
            event = Event(start_time=self._tz.localize(when),
                          timezone=self._tz.zone,
                          location_name=where,
                          name=info['title'],
                          description=info['title'],)
            event.add_source(events_url)

            yield event
Code example #3
File: events.py Project: neelneelpurk/openstates
    def scrape_upper(self):
        url = "http://www.oksenate.gov/Committees/meetingnotices.htm"
        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        text = page.text_content()
        _, text = text.split('MEETING NOTICES')
        re_date = r'[A-Z][a-z]+,\s+[A-Z][a-z]+ \d+, \d{4}'
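        # pair each date heading with the block of text that follows it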
        chunks = zip(re.finditer(re_date, text), re.split(re_date, text)[1:])

        for match, data in chunks:
            when = match.group()
            when = datetime.datetime.strptime(when, "%A, %B %d, %Y")

            lines = list(filter(None, [x.strip() for x in data.splitlines()]))
            time_ = re.search(r'^\s*TIME:\s+(.+?)\s+\x96', data, re.M).group(1)
            time_ = time_.replace('a.m.', 'AM').replace('p.m.', 'PM')
            time_ = time.strptime(time_, '%I:%M %p')
            when += datetime.timedelta(hours=time_.tm_hour, minutes=time_.tm_min)

            title = lines[0]

            where = re.search(r'^\s*PLACE:\s+(.+)', data, re.M).group(1)
            where = where.strip()

            event = Event(name=title,
                          start_date=self._tz.localize(when),
                          location_name=where)
            event.add_source(url)

            yield event
Code example #4
File: events.py Project: sunlightlabs/openstates
    def scrape_meetings(self, meetings, group):
        """
        Scrape and save event data from a list of meetings.

        Arguments:
        meetings -- A list of lxml elements containing event information
        group -- The type of meeting. The legislature site applies
                 different formatting to events based on which group
                 they correspond to.  `group` should be one of the
                 following strings: 'house', 'senate', or 'commission'.

        """
        for meeting in meetings:
            when = self.get_date(meeting)
            description = self.get_description(meeting)
            location = self.get_location(meeting)

            if when and description and location:
                event = Event(name=description, start_date=when.replace(tzinfo=self.tz),
                              description=description,
                              location_name=location)
                agenda = self.get_agenda(meeting)
                if agenda:
                    event.add_agenda_item(agenda)
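                # NOTE: `url` is not defined in this method; presumably the
                # listing-page URL is held elsewhere on the scraper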
                event.add_source(url)
                yield event
Code example #5
def event_obj():
    e = Event(
        name="get-together",
        start_date=datetime.datetime.utcnow().isoformat().split('.')[0] + 'Z',
        location_name="Joe's Place",
    )
    e.add_source(url='http://example.com/foobar')
    return e
Code example #6
def ge():
    event = ScrapeEvent(
        name="America's Birthday",
        start_time="2014-07-04T05:00Z",
        location_name="America",
        timezone="America/New_York",
        all_day=True)
    event.add_person("George Washington")
    return event
Code example #7
File: test_event_scrape.py Project: anukat2015/pupa
def event_obj():
    e = Event(
        name="get-together",
        start_time=datetime.datetime.utcnow(),
        location_name="Joe's Place",
        timezone="America/New_York",
    )
    e.add_source(url='foobar')
    return e
Code example #8
File: events.py Project: neelneelpurk/openstates
    def scrape(self, chamber=None):
        URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
        doc = self.lxmlize(URL)
        events = doc.xpath('//item')

        for info in events:
            title_and_date = info.xpath('title/text()')[0].split(" - ")
            title = title_and_date[0]
            when = title_and_date[-1]
            # if not when.endswith(session[ :len("20XX")]):
            #    continue

            event = Event(name=title,
                          start_date=self._tz.localize(datetime.datetime.strptime(when,
                                                                                  '%b %d, %Y')),
                          location_name='State Capitol'
                          )
            event.add_source(URL)

            url = re.search(r'(http://.*?)\s', info.text_content()).group(1)
            try:
                doc = self.lxmlize(url)
            except HTTPError:
                self.logger.warning("Page missing, skipping")
                continue
            event.add_source(url)

            committee = doc.xpath('//a[text()="View committee page"]/@href')
            if committee:
                committee_doc = self.lxmlize(committee[0])
                committee_name = committee_doc.xpath(
                        '//h3[@class="heading committee"]/text()')[0].strip()
                event.add_participant(committee_name, type='committee',
                                      note='host')

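            # document links live in each cell's onclick handler rather than an href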
            documents = doc.xpath('.//td')
            for document in documents:
                onclick = document.xpath('@onclick')
                if not onclick:
                    # some cells carry no onclick handler at all
                    continue
                url = re.search(r'(http://.*?pdf)', onclick[0])
                if url is None:
                    continue
                url = url.group(1)
                event.add_document(
                        note=document.xpath('text()')[0],
                        url=url,
                        media_type='application/pdf'
                        )
                bills = onclick
                for bill in bills:
                    if "bills/static" in bill:
                        bill_name = bill.split("/")[-1].split(".")[0]
                        item = event.add_agenda_item('Bill up for discussion')
                        item.add_bill(bill_name)
            yield event
Code example #9
File: events.py Project: sunlightlabs/openstates
    def scrape(self):
        page = self.lxmlize(calurl)
        events = page.xpath("//table[@class='agenda-body']//tr")[1:]

        for event in events:
            comit_url = event.xpath(
                ".//a[contains(@href, '/Pages/comm-info.aspx?c=')]")

            if len(comit_url) != 1:
                raise Exception("expected exactly one committee link for event row")

            comit_url = comit_url[0]
            who = self.scrape_participants(comit_url.attrib['href'])

            tds = event.xpath("./*")
            date = tds[0].text_content().strip()
            cttie = tds[1].text_content().strip()
            _chamber, cttie = [x.strip() for x in cttie.split(" - ", 1)]
            info = tds[2]
            name = info.xpath("./a[contains(@href, 'raw')]")[0]
            notice = name.attrib['href']
            name = name.text
            time, where = info.xpath("./i/text()")
            what = tds[3].text_content()
            what = what.replace("Items: ", "")
            if "(None)" in what:
                continue
            what = [x.strip() for x in what.split(";")]

            when = ", ".join([date, str(dt.datetime.now().year), time])
            when = dt.datetime.strptime(when, "%a %b %d, %Y, %I:%M %p")

            event = Event(
                name=name,
                location_name=where,
                start_date=self._tz.localize(when),
            )

            event.add_source(calurl)

            event.add_committee(cttie, note='host')

            event.add_document("notice", notice, media_type='application/pdf')

            for entry in what:
                item = event.add_agenda_item(entry)
                if entry.startswith('AB') or entry.startswith('SB'):
                    item.add_bill(entry)

            for thing in who:
                event.add_person(thing['name'])

            yield event
Code example #10
    def scrape_event_page(self, event):
        url = event.attrib['href']
        page = self.lxmlize(url)
        title = page.xpath("//h2[@class='evlist_header']")
        title = title[0].text.strip() if title else None
        if title is None:
            return
        if "CANCELED" in title:
            return

        info = page.xpath("//div[@style='position:relative;margin-right:40px;']")[0]
        blocks = info.xpath(".//div")
        ret = {}
        for block in blocks:
            els = block.xpath("./*")
            if not els:
                continue
            le = els[0]

            if le.tag != 'label':
                continue

            label, div = els

            ltex = label.text_content().strip()
            dtex = div.text_content().strip()
            ret[ltex] = dtex

        date, start, end = (x.strip() for x in ret['When:'].split("\n"))
        start = re.sub("^@", "", start).strip()
        end = end.replace("-", "").strip()

        replace = [
            ('Apr', 'April'),
        ]

        skip = ["Occurs every"]

        for k, v in replace:
            date = date.replace(k, v).strip()

        if any(x in end for x in skip):
            return

        start = "%s %s" % (date, start)
        end = "%s %s" % (date, end)
        start, end = (dt.datetime.strptime(x, "%B %d, %Y %I:%M %p") for x in (start, end))

        event = Event(name=title, location=ret['Where:'], when=start, end=end)
        event.add_source(url)
        yield event
Code example #11
File: events.py Project: cliftonmcintosh/openstates
    def scrape_chamber(self, chamber):
        url = utils.urls['events'][chamber]
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for table in page.xpath('//table[@class="CMS-MeetingDetail-CurrMeeting"]'):
            date_string = table.xpath('ancestor::div[@class="CMS-MeetingDetail"]/div/a/@name')[0]
            for row in table.xpath('tr'):
                time_string = row.xpath('td[@class="CMS-MeetingDetail-Time"]/text()')[0].strip()
                description = row.xpath(
                    'td[@class="CMS-MeetingDetail-Agenda"]/div/div'
                )[-1].text_content().strip()
                location = row.xpath(
                    'td[@class="CMS-MeetingDetail-Location"]'
                )[0].text_content().strip()
                committees = row.xpath('.//div[@class="CMS-MeetingDetail-Agenda-CommitteeName"]/a')
                bills = row.xpath('.//a[contains(@href, "billinfo")]')

                try:
                    start_time = datetime.datetime.strptime(
                        '{} {}'.format(date_string, time_string),
                        '%m/%d/%Y %I:%M %p',
                    )
                except ValueError:
                    break

                event = Event(
                    name=description,
                    start_time=self._tz.localize(start_time),
                    location_name=location,
                    timezone=self._tz.zone,
                )
                event.add_source(url)

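                # any bills or committees on the row hang off a single agenda item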
                if bills or committees:
                    item = event.add_agenda_item(description)
                    for bill in bills:
                        parsed = urllib.parse.urlparse(bill.get('href'))
                        qs = urllib.parse.parse_qs(parsed.query)
                        # parse_qs returns a list per parameter; take each first value
                        item.add_bill('{}{} {}'.format(qs['body'][0], qs['type'][0], qs['bn'][0]))
                    for committee in committees:
                        parsed = urllib.parse.urlparse(committee.get('href'))
                        qs = urllib.parse.parse_qs(parsed.query)
                        item.add_committee(
                            re.sub(r' \([SH]\)$', '', committee.text),
                            id=qs.get('Code', [None])[0],
                        )

                yield event
Code example #12
    def categorize_data(self, csv_data):
        Contribution = namedtuple('Contribution', self.csv_header_row.replace(' ', '_'))
        # split on newlines explicitly; parsing fails when the data is a single line
        for line in csv_data.split('\n'):
            if not line:
                continue

            # cur_obj will be the person or organization that made the contribution
            cur_obj = None
            contribution = Contribution(*line.split(','))
            
            if contribution.Contributor_Type in self.business_contribution_types:
                cur_obj = Organization(contribution.Contributor_Name)
            elif contribution.Contributor_Type in self.individual_contribution_types:
                cur_obj = Person(contribution.Contributor_Name)
            elif contribution.Contributor_Type == 'Unknown/Anonymous':
                if contribution.Contributor_Name:  # ignore unnamed contributors
                    # these look like catch-all business contributions
                    cur_obj = Organization(contribution.Contributor_Name)
            if cur_obj:
                # cur_obj is not set when an anonymous/unknown contribution
                # lacks a Contributor_Name, so check it exists before adding to it
                cur_obj.add_source(url=self.search_url)
                cur_obj.source_identified = True
                if contribution.Contributor_Address:
                    cur_obj.add_contact_detail(type='address', value=contribution.Contributor_Address)
                if contribution.Employer_Name:
                    cur_obj.extras['Employer'] = contribution.Employer_Name
                if contribution.Employer_Occupation:
                    cur_obj.extras['Occupation'] = contribution.Employer_Occupation
                
                # recipient_obj is the organization that received the contribution
                recipient_obj = Organization(contribution.Receiving_Committee)
                recipient_obj.extras['Office'] = contribution.Office
                recipient_obj.extras['Filing Period'] = contribution.Filing_Period
                recipient_obj.extras['Fundtype'] = contribution.Fundtype

                # the transaction is the event linking the donor and recipient
                transaction = Event('Contribution', contribution.Contribution_Date,
                                    'EST', 'Maryland')  # EST and Maryland because the data is from MD
                transaction.extras['Contribution Amount'] = contribution.Contribution_Amount
                transaction.extras['Contribution Type'] = contribution.Contribution_Type
                transaction.add_source(url=self.search_url)
                transaction.participants.append(cur_obj.as_dict())
                transaction.participants.append(recipient_obj.as_dict())
                yield (cur_obj, recipient_obj, transaction)
            else:
                yield []
Code example #13
File: events.py Project: neelneelpurk/openstates
    def scrape(self):
        EVENTS_URL = 'http://www.akleg.gov/basis/Meeting/Find'
        events = self.lxmlize(EVENTS_URL).xpath('//ul[@id="meetingResults"]/li')
        for info in events:
            event_url = info.xpath('span[@class="col04"]/a/@href')[0]
            doc = self.lxmlize(event_url)

            # Skip events that are placeholders or tentative
            # Also skip whole-chamber events
            if any(x.strip().startswith("No Meeting") for x in
                    doc.xpath('//div[@class="schedule"]//text()')) \
                    or "session" in \
                    info.xpath('span[@class="col01"]/text()')[0].lower():
                continue

            name = " ".join(
                x.strip()
                for x in doc.xpath('//div[@class="schedule"]//text()')
                if x.strip()
            )

            # Skip events with no name
            if not name:
                continue

            event = Event(
                start_date=self._TZ.localize(
                    datetime.datetime.strptime(
                        info.xpath('span[@class="col02"]/text()')[0],
                        self._DATETIME_FORMAT,
                    )
                ),
                name=name,
                location_name=doc.xpath(
                    '//div[@class="heading-container"]/span/text()'
                )[0].title()
            )

            event.add_participant(
                info.xpath('span[@class="col01"]/text()')[0].title(),
                type='committee',
                note='host',
            )

            for document in doc.xpath('//td[@data-label="Document"]/a'):
                event.add_document(
                    document.xpath('text()')[0],
                    url=document.xpath('@href')[0]
                )

            event.add_source(EVENTS_URL)
            event.add_source(event_url.replace(" ", "%20"))

            yield event
Code example #14
File: events.py Project: cliftonmcintosh/openstates
    def scrape(self, session=None, chamber=None):
        if not session:
            session = self.latest_session()
            self.info('no session specified, using %s', session)

        url = "ftp://www.arkleg.state.ar.us/dfadooas/ScheduledMeetings.txt"
        page = self.get(url)
        page = csv.reader(StringIO(page.text), delimiter='|')

        for row in page:
            # Deal with embedded newline characters, which cause fake new rows
            LINE_LENGTH = 11
            while len(row) < LINE_LENGTH:
                row += next(page)

            desc = row[7].strip()

            match = re.match(r'^(.*)- (HOUSE|SENATE)$', desc)
            if match:

                comm = match.group(1).strip()
                comm = re.sub(r'\s+', ' ', comm)
                location = row[5].strip() or 'Unknown'
                when = datetime.datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S')
                when = self._tz.localize(when)
                # Only assign events to a session if they are in the same year
                # Given that session metadata have some overlap and
                # missing end dates, this is the best option available
                session_year = int(session[:4])
                if session_year != when.year:
                    continue

                description = "%s MEETING" % comm
                event = Event(
                        name=description,
                        start_time=when,
                        location_name=location,
                        description=description,
                        timezone=self._tz.zone
                )
                event.add_source(url)

                event.add_participant(comm, type='committee', note='host')
                # time = row[3].strip()
                # if time in TIMECODES:
                #     event['notes'] = TIMECODES[time]

                yield event
Code example #15
    def scrape(self):
        curdate = None
        page = self.lxmlize(CAL_PAGE)
        for el in page.xpath("//div[@id='Section1']/*"):
            if el.tag[0] == 'h':
                when = WHEN.findall(el.text_content())
                when = when[0] if when else None
                if when is None:
                    continue
                curdate = " ".join(when)

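            # non-heading paragraphs inherit the date of the most recent heading (curdate)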
            if el.tag == 'p':  # and el.attrib.get('class') == 'MsoNormal'
                agenda = el.xpath(".//a[contains(@href, 'Archive.aspx')]")
                agenda = agenda[0] if agenda else None
                if agenda is None:
                    continue

                info = self.cleanup(el.text_content())
                when = DT.findall(info)
                when = when[0] if when else None
                if when is None:
                    continue

                people = el.xpath(".//personname")
                places = el.xpath(".//place")
                time, ampm = when

                if curdate is None:
                    self.warning("Can't scrape, since I don't know what date it is")
                    continue

                tbuf = " ".join([curdate, time, ampm])
                obj = dt.datetime.strptime(tbuf, "%B %d %Y %I:%M %p")

                try:
                    _, where = info.rsplit(u"–", 1)
                except ValueError:
                    continue

                where = where.replace(u"\xa0", " ")
                where = re.sub(r"\s+", " ", where).strip()
                where = re.sub(r"agenda$", "", where).strip()

                event = Event(name=info, when=obj, location=where)
                event.add_source(CAL_PAGE)
                yield event
Code example #16
File: events.py Project: neelneelpurk/openstates
    def scrape_event_page(self, session, chamber, url, datetime):
        page = self.lxmlize(url)
        info = page.xpath("//p")
        metainfo = {}
        plaintext = ""
        for p in info:
            content = re.sub(r"\s+", " ", p.text_content())
            plaintext += content + "\n"
            if ":" in content:
                key, val = content.split(":", 1)
                metainfo[key.strip()] = val.strip()
        committee = metainfo['COMMITTEE']
        where = metainfo['PLACE']
        if "CHAIR" in where:
            where, chair = where.split("CHAIR:")
            metainfo['PLACE'] = where.strip()
            metainfo['CHAIR'] = chair.strip()

        chair = None
        if "CHAIR" in metainfo:
            chair = metainfo['CHAIR']

        plaintext = re.sub(r"\s+", " ", plaintext).strip()
        regexp = r"(S|J|H)(B|M|R) (\d+)"
        bills = re.findall(regexp, plaintext)

        event = Event(
            name=committee,
            start_date=self._tz.localize(datetime),
            location_name=where
        )

        event.add_source(url)
        event.add_participant(committee, type='committee', note='host')
        if chair is not None:
            event.add_participant(chair, type='legislator', note='chair')

        for bill in bills:
            chamber, type, number = bill
            bill_id = "%s%s %s" % (chamber, type, number)
            item = event.add_agenda_item('Bill up for discussion')
            item.add_bill(bill_id)

        event.add_agenda_item(plaintext)

        yield event
Code example #17
File: events.py Project: neelneelpurk/openstates
    def scrape_events(self, chamber, event_id):
        url = '%s%s' % (self.upper_url, event_id)
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        rows = doc.xpath("//div[@id='WebPartWPQ2']")
        # some ids are empty
        if len(rows):
            table_data = rows[0].find('table')[1]

            for link in table_data.iterchildren('td'):
                td = link.xpath('//td[@class="ms-formbody"]')

                description = td[18].text
                when = td[19].text
                where = td[25].text
                # type = td[27].text
                meeting_lead = td[28].text

                when = datetime.datetime.strptime(when, "%m/%d/%Y %I:%M %p")  # %I so the %p marker takes effect
                when = self._tz.localize(when)

                if where is None or where == "":
                    where = 'State House'
                event = Event(name=description,
                              start_date=when,
                              location_name=where)
                if td[20].text is None:
                    participants = [meeting_lead]  # a list, so the loop below iterates names rather than characters
                else:
                    participants = td[20].text.split(';')
                if participants:
                    for participant in participants:
                        name = participant.strip().replace('HON.', '', 1)
                        if name != "":
                            event.add_participant(name, type='committee',
                                                  note='host')

                event.add_source(url)
                yield event
        else:
            # hack so we don't fail on the first id numbers, where there are
            # gaps between the numbers that work and those that don't
            if event_id > 1700:
                raise Exception("Parsing is done; we are on future ids that are not used yet.")
Code example #18
    def categorize_data(self, csv_data):
        # Is there a better place to define this?
        Contribution = namedtuple('Contribution', self.csv_header_row.replace(' ', '_'))
        # split on newlines explicitly; parsing fails when the data is a single line
        for line in csv_data.split('\n'):
            if not line:
                continue
            cur_obj = None
            try:
                contribution = Contribution(*line.split(','))
            except Exception:
                # malformed row; warn and skip instead of dropping into the debugger
                self.warning('could not parse contribution line: {}'.format(line))
                continue
            if contribution.Contributor_Type in self.business_contribution_types:
                cur_obj = Organization(contribution.Contributor_Name)
            elif contribution.Contributor_Type in self.individual_contribution_types:
                cur_obj = Person(contribution.Contributor_Name)
            elif contribution.Contributor_Type == 'Unknown/Anonymous':
                if contribution.Contributor_Name:  # ignore unnamed contributors
                    # these look like catch-all business contributions
                    cur_obj = Organization(contribution.Contributor_Name)
            if cur_obj:
                cur_obj.add_source(url=self.search_url)
                cur_obj.source_identified = True
                if contribution.Contributor_Address:
                    cur_obj.add_contact_detail(type='address', value=contribution.Contributor_Address)
                if contribution.Employer_Name:
                    cur_obj.extras['Employer'] = contribution.Employer_Name
                if contribution.Employer_Occupation:
                    cur_obj.extras['Occupation'] = contribution.Employer_Occupation
                
                recipient_obj = Organization(contribution.Receiving_Committee)
                recipient_obj.extras['Office'] = contribution.Office
                recipient_obj.extras['Filing Period'] = contribution.Filing_Period
                recipient_obj.extras['Fundtype'] = contribution.Fundtype

                transaction = Event('Contribution', contribution.Contribution_Date,
                                    'EST', 'Maryland')  # EST and Maryland because the data is from MD
                transaction.extras['Contribution Amount'] = contribution.Contribution_Amount
                transaction.extras['Contribution Type'] = contribution.Contribution_Type
                transaction.add_source(url=self.search_url)
                transaction.participants.append(cur_obj.as_dict())
                transaction.participants.append(recipient_obj.as_dict())
                yield (cur_obj, recipient_obj, transaction)
            else:
                yield [] 
Code example #19
File: events.py Project: cliftonmcintosh/openstates
    def scrape(self, session=None):
        if session is None:
            session = self.latest_session()
        year_slug = session[5:]

        url = 'http://legislature.vermont.gov/committee/loadAllMeetings/{}'.\
                format(year_slug)

        json_data = self.get(url).text
        events = json.loads(json_data)['data']

        for info in events:
            # Determine when the committee meets
            if info['TimeSlot'] == '1':
                start_time = datetime.datetime.strptime(info['MeetingDate'], '%A, %B %d, %Y')
                all_day = True
            else:
                try:
                    start_time = datetime.datetime.strptime(
                        info['MeetingDate'] + ', ' + info['TimeSlot'],
                        '%A, %B %d, %Y, %I:%M %p'
                    )
                except ValueError:
                    start_time = datetime.datetime.strptime(
                        info['MeetingDate'] + ', ' + info['StartTime'],
                        '%A, %B %d, %Y, %I:%M %p'
                    )
                all_day = False

            event = Event(
                start_time=self.TIMEZONE.localize(start_time),
                timezone='America/New_York',
                all_day=all_day,
                name="Meeting of the {}".format(info['LongName']),
                description="committee meeting",
                location_name="{0}, Room {1}".format(info['BuildingName'], info['RoomNbr'])
            )
            event.add_source(url)
            event.add_committee(
                name=info['LongName'],
                note='host'
            )

            yield event
Code example #20
File: events.py Project: cliftonmcintosh/openstates
    def scrape(self):
        tz = pytz.timezone("US/Eastern")
        get_short_codes(self)
        page = self.lxmlize(URL)
        table = page.xpath(
            "//table[@id='ctl00_ContentPlaceHolderCol1_GridView1']")[0]

        for event in table.xpath(".//tr")[1:]:
            tds = event.xpath("./td")
            committee = tds[0].text_content().strip()
            descr = [x.text_content() for x in tds[1].xpath(".//span")]
            if len(descr) != 1:
                raise Exception
            descr = descr[0].replace('.', '').strip()
            when = tds[2].text_content().strip()
            where = tds[3].text_content().strip()
            notice = tds[4].xpath(".//a")[0]
            notice_href = notice.attrib['href']
            notice_name = notice.text
            when = dt.datetime.strptime(when, "%m/%d/%Y %I:%M %p")
            when = pytz.utc.localize(when)
            event = Event(name=descr, start_time=when, classification='committee-meeting',
                          description=descr, location_name=where, timezone=tz.zone)

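            # a slash-joined committee cell presumably indicates a joint hearing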
            if "/" in committee:
                committees = committee.split("/")
            else:
                committees = [committee]

            for committee in committees:
                if "INFO" not in committee:
                    committee = self.short_ids.get(committee, {"chamber": "unknown",
                                                               "name": committee})

                else:
                    committee = {
                        "chamber": "joint",
                        "name": committee,
                    }
                event.add_committee(committee['name'], note='host')

            event.add_source(URL)
            event.add_document(notice_name,
                               notice_href,
                               media_type='text/html')
            for bill in self.get_related_bills(notice_href):
                a = event.add_agenda_item(description=bill['descr'])
                a.add_bill(
                    bill['bill_id'],
                    note=bill['type']
                )
            yield event
Code example #21
    def scrape(self):
        local_timezone = pytz.timezone("US/Eastern")
        base_calendar_url = "http://www.miamidade.gov/cob/county-commission-calendar.asp"
        # things get messy more than a few months out, so we're just pulling
        # 3 months. If we want three more, they are called "nxx", "nxy" and "nxz".
        months = ["cur", "nex", "nxw"]
        for m in months:
            doc = self.lxmlize(base_calendar_url + "?next={}".format(m))
            events = doc.xpath("//table[contains(@style,'dotted #ccc')]")
            for event in events:
                # not every event table provides every field; set defaults first
                title = link = when = where = description = None
                rows = event.xpath(".//tr")
                for row in rows:
                    heading, data = row.xpath(".//td")
                    h = heading.text_content().lower().replace(":","").strip()
                    if h == "event":
                        title = data.text_content()
                        link = data.xpath(".//a")[0].attrib["href"]
                    elif h == "event date":
                        when = datetime.strptime(data.text, '%m/%d/%y %I:%M%p')  # %I so the am/pm marker takes effect
                        when = local_timezone.localize(when)
                    elif h == "location":
                        where = data.text
                    elif h == "description":
                        description = data.text

                if title is None or when is None:
                    continue  # the table never yielded an event name or date

                if not description:
                    description = ""

                status = "confirmed"
                if "cancelled" in title.lower():
                    status = "cancelled"

                e = Event(name=title,
                          start_time=when,
                          timezone="US/Eastern",
                          location_name=where,
                          description=description,
                          status=status)
                e.add_source(link)
                yield e
Code example #22
File: events.py Project: sunlightlabs/openstates
    def scrape_house_weekly_schedule(self):
        url = "http://house.louisiana.gov/H_Sched/Hse_MeetingSchedule.aspx"
        page = self.lxmlize(url)

        meeting_rows = page.xpath('//table[@id = "table229"]/tr')

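        # keep only rows that name a committee, link a PDF agenda,
        # and are not marked "Not Meeting"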
        valid_meetings = [row for row in meeting_rows if row.xpath(
            './td[1]')[0].text_content().replace(u'\xa0', '') and row.xpath(
            './td/a/img[contains(@src, "PDF-AGENDA.png")]') and 'Not Meeting' not in row.xpath(
            './td[2]')[0].text_content()]

        for meeting in valid_meetings:
            try:
                guid = meeting.xpath('./td/a[descendant::img[contains(@src,'
                                     '"PDF-AGENDA.png")]]/@href')[0]
                self.logger.debug(guid)
            except IndexError:
                # Sometimes we have a dead link. This is only on dead entries.
                continue

            committee_name = meeting.xpath('./td[1]/text()')[0].strip()
            meeting_string = meeting.xpath('./td[2]')[0].text_content()

            if "@" in meeting_string:
                continue  # Contains no time data.
            date, time, location = ([s.strip() for s in meeting_string.split(
                ',') if s] + [None]*3)[:3]

            # check for time in date because of missing comma
            time_srch = re.search(r'\d{2}:\d{2} (AM|PM)', date)
            if time_srch:
                location = time
                time = time_srch.group()
                date = date.replace(time, '')

            self.logger.debug(location)

            year = datetime.datetime.now().year
            datetime_string = ' '.join((date, str(year), time))
            when = datetime.datetime.strptime(datetime_string, '%b %d %Y %I:%M %p')
            when = self._tz.localize(when)

            description = 'Committee Meeting: {}'.format(committee_name)
            self.logger.debug(description)

            event = Event(name=description,
                          start_date=when,  # already localized above
                          location_name=location)
            event.add_source(url)
            event.add_participant(committee_name, type='committee', note='host')
            event.add_document(note='Agenda', url=guid, text='agenda',
                               media_type='application/pdf')

            yield event
Code example #23
File: events.py Project: neelneelpurk/openstates
    def scrape_upper_events(self):
        url = "https://www.flsenate.gov/Tracker/RSS/DailyCalendar"
        page = self.get(url).text
        feed = feedparser.parse(page)
        for entry in feed['entries']:
            # The feed breaks the RSS standard by making the pubdate the
            # actual event's date, not the RSS item publish date
            when = datetime.datetime(*entry['published_parsed'][:6])
            when = pytz.utc.localize(when)

            desc = entry['summary'].split(' - ')[0]
            location = entry['summary'].split(' - ')[1]

            event = Event(name=desc,
                          start_date=when,
                          description=desc,
                          location_name=location)

            event.add_source(entry['link'])
            yield event
Code example #24
    def scrape(self):
        start = dt.datetime.utcnow()
        start = start - dt.timedelta(days=10)
        end = start + dt.timedelta(days=30)

        url = URL.format(**{"from": start.strftime("%Y/%m/%d"), "til": end.strftime("%Y/%m/%d")})

        page = self.lxmlize(url)
        events = page.xpath("//ul[contains(@class, 'committee-events')]//li")

        for event in events:
            po = CLICK_INFO.match(event.xpath(".//span")[0].attrib["onclick"])
            if po is None:
                continue

            poid = po.groupdict()["info_id"]  # used to fetch more details on the event

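            # popOverUrl presumably fetches and parses the event's pop-over detail page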
            popage = self.popOverUrl(poid)
            when = dt.datetime.strptime(popage.xpath("//strong")[0].text, "%B %d, %Y @ %I:%M %p")
            who = popage.xpath("//h1")[0].text
            related = []

            for item in popage.xpath("//div"):
                t = item.text
                if t is None:
                    continue

                t = t.strip()
                for related_entity in ORD_INFO.findall(t):
                    related.append({"ord_no": related_entity, "what": t})

            e = Event(name=who, when=when, location="unknown")
            e.add_source(url)

            for o in related:
                i = e.add_agenda_item(o["what"])
                i.add_bill(o["ord_no"], note="consideration")

            yield e
Code example #25
    def scrape_event(self, href):
        page = self.lxmlize(href.attrib['href'])
        what = page.xpath("//td[@id='ctl14_ctl16_tdTitleCell']")[0].text
        info = page.xpath("//div[@id='ctl14_pnlEvent']//table//table//tr")[1:]
        ret = {
            "Location:": "Unknown"
        }
        for tr in info:
            tds = tr.xpath(".//td")
            if len(tds) < 2:
                continue
            # fresh names here: `what` above holds the page title used as the event name
            key, value = [tds.pop(0).text_content().strip() for _ in range(2)]
            ret[key] = value

        agendas = page.xpath("//a[contains(@title, 'Meeting Agenda')]")
        if agendas:
            for agenda in agendas:
                print("Agenda:", agenda.attrib['href'])

        t = ret['Time:']
        start_time, end_time = t, None
        if "-" in t:
            start_time, end_time = (x.strip() for x in t.split("-", 1))

        start_time = "%s %s" % (ret['Date:'], start_time)
        dts = "%B %d, %Y %I:%M %p"
        start = dt.datetime.strptime(start_time, dts)

        end = None
        if end_time:
            end = "%s %s" % (ret['Date:'], end_time)
            end = dt.datetime.strptime(end, dts)

        kwargs = {}
        if end:
            kwargs['end'] = end

        e = Event(name=what, location=ret['Location:'], when=start,
                  **kwargs)
        e.add_source(href.attrib['href'])
        yield e
Code example #26
File: events.py Project: sunlightlabs/openstates
    def scrape_event(self, row):
        date_td = row.xpath('td[1]')[0]
        info_td = row.xpath('td[2]')[0]

        date = date_td.xpath('b')[0].text.strip()
        time = date_td.xpath('b/following-sibling::text()')[0].strip()

        date_and_time = "{} {}".format(date, time)
        start_date = datetime.datetime.strptime(
            date_and_time, '%m/%d/%y %I:%M %p')

        title = info_td.xpath('font[1]/strong')[0].text.strip()

        all_text = info_td.xpath('descendant-or-self::*/text()')
        notes = (line.strip() for line in all_text if line.strip())
        notes = list(notes)
        # Skip the first line, which is the title
        notes = notes[1:]
        # Split out the address
        address = notes[0]
        notes = notes[1:]
        # The rest just becomes the description
        notes = "\n".join(notes)

        event = Event(
            start_date=self._TZ.localize(start_date),
            name=title,
            location_name=address,
            description=notes
        )

        event.add_source(self.URL)

        if info_td.xpath('a[contains(font/text(),"agenda")]'):
            agenda_url = info_td.xpath('a/@href')[0]
            event.add_document(
                "Agenda",
                url=agenda_url
            )

        yield event
Code example #27
    def scrape(self):
        url = "http://meetingrecords.cityofboston.gov/sirepub/meetresults.aspx"

        page = self.lxmlize(url)
        for entry in page.xpath(
                "//tr[@style='font-family: Verdana; font-size: 12px;']"):
            name, when, links = entry.xpath(".//td")
            name = name.text.strip().replace(u"\xc2\xa0", "")
            when = when.text.strip().replace(u"\xc2\xa0", "")
            when = dt.datetime.strptime(when, "%m/%d/%Y")
            links = links.xpath(".//a")
            links = {x.text: x.attrib['href'] for x in links}
            e = Event(name=name,
                      when=when,
                      location='unknown')

            e.add_source(url)
            for note, url in links.items():
                e.add_link(note=note, url=url)

            yield e
Code example #28
File: events.py Project: sunlightlabs/openstates
    def scrape_upper(self):
        listing_url = 'https://www.senate.mo.gov/hearingsschedule/hrings.htm'

        html = self.get(listing_url).text

        # The HTML here isn't wrapped in a container per-event
        # which makes xpath a pain. So string split by <hr>
        # then parse each event's fragment for cleaner results
        for fragment in html.split('<hr />')[1:]:
            page = lxml.html.fromstring(fragment)

            when_date = self.row_content(page, 'Date:')
            when_time = self.row_content(page, 'Time:')
            location = self.row_content(page, 'Room:')

            location = '{}, {}'.format(
                location,
                '201 W Capitol Ave, Jefferson City, MO 65101'
            )

            # com = self.row_content(page, 'Committee:')
            com = page.xpath('//td[descendant::b[contains(text(),"Committee")]]/a/text()')[0]
            com = com.split(', Senator')[0].strip()

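            # combine the separate date and time rows into one timezone-aware start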
            start_date = self._TZ.localize(
                dateutil.parser.parse('{} {}'.format(when_date, when_time))
            )

            event = Event(
                start_date=start_date,
                name=com,
                location_name=location
            )

            event.add_source(listing_url)

            event.add_participant(
                com,
                type='committee',
                note='host',
            )

            for bill_table in page.xpath('//table[@width="85%" and @border="0"]'):
                bill_link = ''
                if bill_table.xpath(self.bill_link_xpath):
                    agenda_line = bill_table.xpath('string(tr[2])').strip()
                    agenda_item = event.add_agenda_item(description=agenda_line)

                    bill_link = bill_table.xpath(self.bill_link_xpath)[0].strip()
                    agenda_item.add_bill(bill_link)
                else:
                    agenda_line = bill_table.xpath('string(tr[1])').strip()
                    agenda_item = event.add_agenda_item(description=agenda_line)

            yield event
Code example #29
File: test_event_importer.py Project: Vanuan/pupa
def test_full_event():
    j = Jurisdiction.objects.create(id='jid', division_id='did')
    event = ScrapeEvent(name="America's Birthday", start_time="2014-07-04", location="America",
                        all_day=True)
    event.add_person("George Washington")
    event.add_media_link("fireworks", "http://example.com/fireworks.mov")

    EventImporter('jid').import_data([event.as_dict()])
Code example #30
File: events.py Project: sunlightlabs/openstates
    def scrape(self):

        EVENTS_URL = 'http://www.legislature.state.al.us/aliswww/ISD/InterimMeetings.aspx'
        rows = self.lxmlize(EVENTS_URL).xpath(
            '//table[@id="ContentPlaceHolder1_gvInterimMeeting"]/tr')
        for row in rows[1:]:
            date = row.xpath('td')[0].text_content().strip()
            time = row.xpath('td')[1].text_content().strip()

            date_with_time = '{} {}'.format(date, time)

            location = row.xpath('td')[2].text_content().strip()

            # 11 South Union Street, Montgomery, Alabama, United States
            # TODO: IF location is "room (X)" add state house
            # TODO: REplace "state house" with address

            # 32°22′37.294″N 86°17′57.991″W

            # host = row.xpath('td')[3].text_content().strip()
            name = row.xpath('td')[3].text_content().strip()
            details = row.xpath('td')[4].text_content().strip()

            event = Event(
                start_date=self._TZ.localize(
                    datetime.datetime.strptime(
                        date_with_time,
                        self._DATETIME_FORMAT,
                    )
                ),
                name=name,
                location_name=location,
                description=details
            )

            event.add_source(EVENTS_URL)

            yield event
Code example #31
File: events.py Project: jtroxell1414/openstates
    def scrape_meeting_notice(self, chamber, item, url):
        # the event name is not provided for all meetings, so use the committee name
        event_name = str(item["CommitteeName"])
        # MeetingDateTime looks like: 04/25/2012 03:00:00 PM
        fmt = "%m/%d/%Y %I:%M:%S %p"
        start_time = dt.datetime.strptime(str(item["MeetingDateTime"]), fmt)
        location_name = str(item["AddressAliasNickname"])
        event = Event(
            location_name=location_name,
            start_date=self._tz.localize(start_time),
            name=event_name,
            description="Committee Meeting Status: {}".format(
                item["CommitteeMeetingStatusName"]),
        )

        event.add_source(url)
        event.add_committee(name=str(item["CommitteeName"]),
                            id=item["CommitteeId"])

        page_url = ("http://legis.delaware.gov/json/MeetingNotice/"
                    "GetCommitteeMeetingItems?committeeMeetingId={}".format(
                        item["CommitteeMeetingId"]))

        event.add_source(page_url)
        page_data = self.post(page_url).json()["Data"]
        for item in page_data:
            event.add_agenda_item(description=str(item["ItemDescription"]))
            event.add_person(
                name=str(item["PrimarySponsorShortName"]),
                id=str(item["PrimarySponsorPersonId"]),
                note="Sponsor",
            )

        yield event
Code example #32
    def scrape(self, chamber=None, session=None):
        url = "http://leg.colorado.gov/content/committees"
        if not session:
            session = self.latest_session()
            self.info("no session specified, using %s", session)
        chambers = [chamber] if chamber else ["upper", "lower"]
        for chamber in chambers:
            if chamber == "lower":
                xpath = (
                    '//div/h3[text()="House Committees of Reference"]/../'
                    'following-sibling::div[contains(@class,"view-content")]/'
                    'table//td//span[contains(@class,"field-content")]/a/@href'
                )
            elif chamber == "upper":
                xpath = (
                    '//div/h3[text()="Senate Committees of Reference"]/../'
                    'following-sibling::div[contains(@class,"view-content")]/'
                    'table//td//span[contains(@class,"field-content")]/a/@href'
                )
            elif chamber == "other":
                # All the links under the headers that don't contain "House" or "Senate"
                xpath = (
                    '//div/h3[not(contains(text(),"House")) and '
                    'not(contains(text(),"Senate"))]/../'
                    'following-sibling::div[contains(@class,"view-content")]/'
                    'table//td//span[contains(@class,"field-content")]/a/@href'
                )

            page = self.lxmlize(url)
            com_links = page.xpath(xpath)

            for link in com_links:
                page = self.lxmlize(link)

                hearing_links = page.xpath(
                    '//div[contains(@class,"schedule-item-content")]' "/h4/a/@href"
                )

                for link in hearing_links:
                    try:
                        page = self.lxmlize(link)

                        title = page.xpath(
                            '//header/h1[contains(@class,"node-title")]'
                        )[0]
                        title = title.text_content().strip()

                        date_day = page.xpath(
                            '//div[contains(@class,"calendar-date")]'
                        )[0]
                        date_day = date_day.text_content().strip()

                        details = page.xpath(
                            '//span[contains(@class, "calendar-details")]'
                        )[0]
                        details = details.text_content().split("|")

                        date_time = details[0].strip()
                        location = details[1].strip()

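                        # "Upon Adjournment" hearings carry no clock time;
                        # fall back to midnight of the listed day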
                        if "Upon Adjournment" in date_time:
                            date = dt.datetime.strptime(date_day, "%A %B %d, %Y")
                        else:
                            date_str = "{} {}".format(date_day, date_time)
                            date = dt.datetime.strptime(
                                date_str, "%A %B %d, %Y %I:%M %p"
                            )

                        agendas = []
                        # they overload the bills table with other agenda items;
                        # colspan=2 marks an agenda row
                        non_bills = page.xpath(
                            '//td[@data-label="Hearing Item" and @colspan="2"]'
                        )
                        for row in non_bills:
                            content = row.text_content().strip()
                            agendas.append(content)

                        agenda = "\n".join(agendas) if agendas else ""

                        event = Event(
                            name=title,
                            start_date=self._tz.localize(date),
                            location_name=location,
                        )
                        if agenda:
                            event.add_agenda_item(agenda)
                        event.add_source(link)
                        bills = page.xpath('//td[@data-label="Hearing Item"]/a')
                        for bill in bills:
                            bill_id = bill.text_content().strip()

                            item = event.add_agenda_item("hearing item")
                            item.add_bill(bill_id)

                        yield event
                    except Exception as exc:  # TODO: this is awful; narrow the exception
                        self.warning('skipping hearing page {}: {}'.format(link, exc))
Code example #33
    def scrape(self):
        for c in house_base:
            m = {'room': 'n/a', 'chair': None, 'cmt': None}  # defaults; the info block below may be missing
            m['notice'] = c.xpath('.//p/span[@class="cal_special"]/text()')
            links = c.xpath('.//h3/a/@href')            
            if len(links) > 0:
                m['cmt'] = c.xpath('.//h3/a/text()')[0]
                m['link'] = c.xpath('.//h3/a/@href')[0]
                title = c.xpath('.//h3/text()')[0]
                if title == 'Agenda:':
                    m['title'] = c.xpath('.//h3/a/text()')[0]
                else:
                    m['title'] = c.xpath('.//h3/text()')[0]
                
            else:
                m['title'] = c.xpath('.//h3/text()')[0]
                m['link'] = None
            info_div = c.xpath('.//*[@class="calendar_p_indent"]')
            info_div = info_div[0] if info_div else None
            if info_div is not None and len(info_div) > 0:
                info_list = info_div.xpath('.//text()')
                info_list = [x.replace('\n', '').strip() for x in info_list]
                info_list = [x for x in info_list if len(x) > 0]
                if info_list[0].startswith('Room:'):
                    m['room'] = info_list[1]
                else:
                    m['room'] = 'n/a'
                if len(info_list) > 2:
                    if info_list[2].startswith('Chair:'):
                        chair = info_list[3]
                        if ',' in chair:
                            chairs = chair.replace('\xa0', '').split(',')
                            nchairs = []
                            for chair in chairs:
                                if chair.startswith('Rep.') or chair.startswith('Sen.'):
                                    cname = pull_middle_name(chair[4:])
                                    nchairs.append(cname.strip())
                            m['chair'] = nchairs
                        elif chair.startswith('Rep.') or chair.startswith('Sen.'):
                            cname = pull_middle_name(chair[4:].strip())
                            m['chair'] = [cname.strip()]
                else:
                    m['chair'] = None
            
            bill_rows = c.xpath('.//*/table[@class="cal_bills"]/tbody/tr')
            bills = []
            for brs in bill_rows:
                cells = brs.xpath('.//td')
                if len(cells) == 3:
                    b = {}
                    b['bill'] = cells[0].xpath('.//text()')[0]
                    b['author'] = cells[1].xpath('./text()')[0]
                    b['summary'] = cells[2].xpath('./text()')[0]
                    bills.append(b)
            if len(m['notice']) > 0:
                m['notice'] = m['notice'][0]
            else:
                m['notice'] = 'N/A'
            date = c.xpath('.//p/b/text()')
            if len(date) < 1:
                self.warning('no date found for calendar entry; skipping')
                continue
            m['date'] = datetime.datetime.strptime(date[0], format1)

            if 'House Meets in Session' in m['title']:
                m['room'] = 'State leg'
                m['cmt'] = 'Minnesota House of Representatives'
                m['chair'] = None
                m['link'] = 'https://www.leg.state.mn.us/cal?type=all'
            event = Event(name=m['title'],
                          start_date=tz.localize(m['date']),
                          location_name=m['room'])
            for bill in bills:
                item = event.add_agenda_item(description=bill['summary'])
                item.add_bill(bill['bill'].replace('HF', 'HF '))
            if m['cmt']:
                event.add_committee(m['cmt'])
            if m['link'] is not None:
                event.add_source(m['link'])
            if m['chair'] is not None:
                for chair in m['chair']:
                    event.add_person(name=chair, note="Chair")
            yield event
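
Note: `pull_middle_name` above is a helper defined elsewhere in the scraper module and its implementation is not shown. Judging from how it is called on names like 'Rep. John Q. Public', a minimal sketch might look like this (an assumption, not the project's actual code):

def pull_middle_name(name):
    # Hypothetical sketch: drop a single-letter middle initial such as 'Q.'
    # from 'John Q. Public'; the real helper may normalize differently.
    parts = name.strip().split()
    if len(parts) == 3:
        middle = parts[1].rstrip('.')
        if len(middle) == 1 and middle.isalpha():
            parts.pop(1)
    return ' '.join(parts)

print(pull_middle_name('John Q. Public'))  # 'John Public'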
Code example #34
    def scrape(self):
        last_events = deque(maxlen=10)
        for event, agenda in self.events(since=2017):
            other_orgs = ''
            extras = []

            if '--em--' in event[u'Meeting Location']:
                location_string, note = event[u'Meeting Location'].split(
                    '--em--')[:2]
                for each in note.split(' - '):
                    if each.startswith('Join'):
                        other_orgs = each
                    else:
                        extras.append(each)
            else:
                location_string = event[u'Meeting Location']

            location_list = location_string.split('-', 2)
            location = ', '.join([each.strip() for each in location_list[0:2]])
            if not location:
                continue

            when = self.toTime(event[u'Meeting Date'])

            response = self.get(event['iCalendar']['url'], verify=False)
            event_time = self.ical(
                response.text).subcomponents[0]['DTSTART'].dt
            when = when.replace(hour=event_time.hour, minute=event_time.minute)

            time_string = event['Meeting Time']
            if time_string in ('Deferred', ):
                status = 'cancelled'
            elif self.now() < when:
                status = 'confirmed'
            else:
                status = 'passed'

            description = event['Meeting\xa0Topic']
            if any(each in description for each in ('Multiple meeting items',
                                                    'AGENDA TO BE ANNOUNCED')):
                description = ''

            event_name = event['Name']

            event_id = (event_name, when)

            if event_id in last_events:
                continue
            else:
                last_events.append(event_id)

            e = Event(name=event_name,
                      start_time=when,
                      timezone=self.TIMEZONE,
                      description=description,
                      location_name=location,
                      status=status)

            if extras:
                e.extras = {'location note': ' '.join(extras)}

            if event['Multimedia'] != 'Not\xa0available':
                e.add_media_link(note='Recording',
                                 url=event['Multimedia']['url'],
                                 type="recording",
                                 media_type='text/html')

            self.addDocs(e, event, 'Agenda')
            self.addDocs(e, event, 'Minutes')

            if event['Name'] == 'City Council Stated Meeting':
                participating_orgs = ['New York City Council']
            elif 'committee' in event['Name'].lower():
                participating_orgs = [event["Name"]]
            else:
                participating_orgs = []

            if other_orgs:
                other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
                participating_orgs += re.split(' and the |, the ', other_orgs)

            for org in participating_orgs:
                e.add_committee(name=org)

            if agenda:
                e.add_source(event["Meeting Details"]['url'], note='web')

                for item, _, _ in agenda:
                    if item["Name"]:
                        agenda_item = e.add_agenda_item(item["Name"])
                        if item["File\xa0#"]:
                            if item['Action']:
                                note = item['Action']
                            else:
                                note = 'consideration'
                            agenda_item.add_bill(item["File\xa0#"]['label'],
                                                 note=note)
            else:
                e.add_source(self.EVENTSPAGE, note='web')

            yield e
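
The `deque(maxlen=10)` above (from `collections`) acts as a rolling deduplication window: appending beyond the maximum length silently evicts the oldest entry, so only recently seen (name, start-time) pairs are compared. The pattern in isolation:

from collections import deque

seen = deque(maxlen=3)  # remember only the 3 most recent keys
for key in ['a', 'b', 'a', 'c', 'd', 'a']:
    if key in seen:
        continue  # duplicate within the window; skip it
    seen.append(key)
    print(key)
# prints: a b c d a -- the final 'a' passes because the first 'a'
# was already evicted from the window by then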
Code example #35
    def scrape(self, window=None):
        if window:
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
                float(window))
        else:
            n_days_ago = None

        events = self.events(n_days_ago)

        for event, web_event in self._merge_events(events):
            body_name = event["EventBodyName"]

            if 'Board of Directors -' in body_name:
                body_name, event_name = [
                    part.strip() for part in body_name.split('-')
                ]
            else:
                event_name = body_name

            # Events can have an EventAgendaStatusName of "Final", "Final Revised",
            # and "Final 2nd Revised."
            # We classify these events as "passed."
            status_name = event['EventAgendaStatusName']
            if status_name.startswith('Final'):
                status = 'passed'
            elif status_name == 'Draft':
                status = 'confirmed'
            elif status_name == 'Canceled':
                status = 'cancelled'
            else:
                status = 'tentative'

            location = event["EventLocation"]

            if not location:
                # We expect some events to have no location. LA Metro would
                # like these displayed in the Councilmatic interface. However,
                # OCD requires a value for this field. Add a sane default.
                location = 'Not available'

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=location,
                      status=status)

            e.pupa_id = str(event['EventId'])

            # Metro requires the EventGuid to build out MediaPlayer links.
            # Add both the English event GUID, and the Spanish event GUID if
            # it exists, to the extras dict.
            e.extras = {'guid': event['EventGuid']}

            legistar_api_url = self.BASE_URL + '/events/{0}'.format(
                event['EventId'])
            e.add_source(legistar_api_url, note='api')

            if event.get('SAPEventGuid'):
                e.extras['sap_guid'] = event['SAPEventGuid']

            if 'event_details' in event:
                # if there is not a meeting detail page on legistar
                # don't capture the agenda data from the API
                for item in self.agenda(event):
                    agenda_item = e.add_agenda_item(item["EventItemTitle"])
                    if item["EventItemMatterFile"]:
                        identifier = item["EventItemMatterFile"]
                        agenda_item.add_bill(identifier)

                    if item["EventItemAgendaNumber"]:
                        # To the notes field, add the item number as given in the agenda minutes
                        note = "Agenda number, {}".format(
                            item["EventItemAgendaNumber"])
                        agenda_item['notes'].append(note)

                    # The EventItemAgendaSequence provides
                    # the line number of the Legistar agenda grid.
                    agenda_item['extras']['item_agenda_sequence'] = item[
                        'EventItemAgendaSequence']

                # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
                # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
                # cleans the data.
                item_agenda_sequences = [
                    item['extras']['item_agenda_sequence'] for item in e.agenda
                ]
                if len(item_agenda_sequences) != len(
                        set(item_agenda_sequences)):
                    error_msg = (
                        'An agenda has duplicate agenda items on the '
                        'Legistar grid: {event_name} on {event_date} '
                        '({legistar_api_url}). Contact Metro, and ask them '
                        'to remove the duplicate EventItemAgendaSequence.')

                    raise ValueError(
                        error_msg.format(
                            event_name=e.name,
                            event_date=e.start_date.strftime("%B %d, %Y"),
                            legistar_api_url=legistar_api_url))

            e.add_participant(name=body_name, type="organization")

            if event.get('SAPEventId'):
                e.add_source(self.BASE_URL +
                             '/events/{0}'.format(event['SAPEventId']),
                             note='api (sap)')

            if event['EventAgendaFile']:
                e.add_document(note='Agenda',
                               url=event['EventAgendaFile'],
                               media_type="application/pdf")

            if event['EventMinutesFile']:
                e.add_document(note='Minutes',
                               url=event['EventMinutesFile'],
                               media_type="application/pdf")

            for audio in event['audio']:
                try:
                    redirect_url = self.head(audio['url']).headers['Location']

                except KeyError:
                    # In some cases, the redirect URL does not yet
                    # contain the location of the audio file. Skip
                    # these events, and retry on next scrape.
                    continue

                e.add_media_link(note=audio['label'],
                                 url=redirect_url,
                                 media_type='text/html')

            if web_event['Recap/Minutes'] != 'Not\xa0available':
                e.add_document(note=web_event['Recap/Minutes']['label'],
                               url=web_event['Recap/Minutes']['url'],
                               media_type="application/pdf")

            if event['event_details']:
                for link in event['event_details']:
                    e.add_source(**link)
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx',
                             note='web')

            yield e
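
The duplicate-sequence guard above works because building a set collapses repeated values, so a length mismatch proves that at least one EventItemAgendaSequence occurs twice. The check, extracted:

def has_duplicates(values):
    # True when any (hashable) value occurs more than once.
    return len(values) != len(set(values))

assert has_duplicates([1, 2, 2, 3])
assert not has_duplicates([1, 2, 3])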
Code example #36
    def scrape_chamber(self, chamber):
        """
        Scrape upper or lower committee agendas
        """
        # session = self.latest_session()
        # since we are scraping only latest_session
        # session_id = self.session_metadata.session_id_meta_data[session]

        # Could use &ShowAll=ON, but it doesn't seem to work.
        url = 'http://www.azleg.gov/CommitteeAgendas.asp?Body=%s' % self._chamber_short[chamber]
        html_ = self.get(url).text
        doc = html.fromstring(html_)
        if chamber == 'upper':
            event_table = doc.xpath('//table[@id="body"]/tr/td/table[2]/'
                                    'tr/td/table/tr/td/table')[0]
        else:
            event_table = doc.xpath('//table[@id="body"]/tr/td/table[2]/tr'
                                    '/td/table/tr/td/table/tr/td/table')[0]
        for row in event_table.xpath('tr')[2:]:
            # Agenda Date, Committee, Revised, Addendum, Cancelled, Time, Room,
            # HTML Document, PDF Document for house
            # Agenda Date, Committee, Revised, Cancelled, Time, Room,
            # HTML Document, PDF Document for senate
            text = [x.text_content().strip() for x in row.xpath('td')]
            when, committee = text[0:2]
            if chamber == 'upper':
                time, room = text[4:6]
                link = row[6].xpath('string(a/@href)')
            else:
                time, room = text[5:7]
                link = row[7].xpath('string(a/@href)')
            if 'NOT MEETING' in time or 'CANCELLED' in time:
                continue
            time = re.match(r'(\d+:\d+ (A|P))', time)
            if time:
                when = "%s %sM" % (text[0], time.group(0))
                when = datetime.datetime.strptime(when, '%m/%d/%Y %I:%M %p')
            else:
                when = text[0]
                when = datetime.datetime.strptime(when, '%m/%d/%Y')

            title = "Committee Meeting:\n%s %s %s\n" % (
                                              self._chamber_long[chamber],
                                              committee, room)
            agenda_info = self.parse_agenda(chamber, link)

            description = agenda_info['description']
            member_list = agenda_info['member_list']
            related_bills = agenda_info['related_bills']
            """
            event = Event(session, when, 'committee:meeting', title,
                          location=room, link=link, details=description,
                          related_bills=related_bills)
            """
            event = Event(location_name=room,
                          start_date=self._tz.localize(when),
                          name=title,
                          description=description,
                          )
            event.add_participant(committee, type='committee', note='host')

            event.participants.extend(member_list)
            event.add_source(url)
            event.add_source(link)
            yield event
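
The time handling above captures only '1:30 P' from cell text like '1:30 P.M.' and re-appends an 'M' before parsing, which sidesteps the dotted 'A.M./P.M.' spelling that strptime's %p does not accept. A standalone demonstration (the sample strings are made up):

import datetime
import re

cell = '1:30 P.M., Room 109'  # hypothetical table cell
match = re.match(r'(\d+:\d+ (A|P))', cell)
if match:
    # match.group(0) == '1:30 P'; append 'M' for a strptime-friendly form
    when = datetime.datetime.strptime(
        '02/14/2017 %sM' % match.group(0), '%m/%d/%Y %I:%M %p')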
Code example #37
    def scrape(self, chamber=None):
        URL = 'http://utahlegislature.granicus.com/ViewPublisherRSS.php?view_id=2&mode=agendas'
        doc = self.lxmlize(URL)
        events = doc.xpath('//item')

        for info in events:
            title_and_date = info.xpath('title/text()')[0].split(" - ")
            title = title_and_date[0]
            when = title_and_date[-1]
            # if not when.endswith(session[ :len("20XX")]):
            #    continue

            event = Event(name=title,
                          start_date=self._tz.localize(
                              datetime.datetime.strptime(when, '%b %d, %Y')),
                          location_name='State Capitol')
            event.add_source(URL)

            url = re.search(r'(http://.*?)\s', info.text_content()).group(1)
            try:
                doc = self.lxmlize(url)
            except HTTPError:
                self.logger.warning("Page missing, skipping")
                continue
            event.add_source(url)

            committee = doc.xpath('//a[text()="View committee page"]/@href')
            if committee:
                committee_doc = self.lxmlize(committee[0])
                committee_name = committee_doc.xpath(
                    '//h3[@class="heading committee"]/text()')[0].strip()
                event.add_participant(committee_name,
                                      type='committee',
                                      note='host')

            documents = doc.xpath('.//td')
            for document in documents:
                # Not every cell has an onclick attribute; indexing [0]
                # unconditionally used to raise IndexError.
                onclick = document.xpath('@onclick')
                if not onclick:
                    continue
                url = re.search(r'(http://.*?pdf)', onclick[0])
                if url is None:
                    continue
                url = url.group(1)
                event.add_document(note=document.xpath('text()')[0],
                                   url=url,
                                   media_type='application/pdf')
                for bill in onclick:
                    if "bills/static" in bill:
                        bill_name = bill.split("/")[-1].split(".")[0]
                        item = event.add_agenda_item('Bill up for discussion')
                        item.add_bill(bill_name)
            yield event
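
`self.lxmlize(url)` above is assumed to be a convenience method on the scraper base class; a plausible minimal implementation (an assumption, not the project's verified code):

import lxml.html
import requests

def lxmlize(url):
    # Fetch a page and return an lxml tree with links made absolute.
    response = requests.get(url)
    page = lxml.html.fromstring(response.text)
    page.make_links_absolute(url)
    return page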
Code example #38
File: events.py  Project: unfold-inc/openstates
    def scrape_upper(self):
        PDF_URL = 'http://www.ohiosenate.gov/Assets/CommitteeSchedule/calendar.pdf'
        (path, _response) = self.urlretrieve(PDF_URL)
        text = convert_pdf(path, type='text').decode()
        os.remove(path)

        days = re.split(r'(\w+day, \w+ \d{1,2})', text)
        date = None
        for i, chunk in enumerate(days[1:]):
            if i % 2 == 0:
                # Calendar is put out for the current week, so use that year
                date = chunk + ", " + str(datetime.datetime.now().year)
            else:

                events = re.split(r'\n\n((?:\w+\s?)+),\s', chunk)
                comm = ''
                for j, block in enumerate(events[1:]):
                    if j % 2 == 0:
                        comm = block.strip()
                    else:

                        try:
                            (time, location, description) = re.search(
                                r'''(?mxs)
                                    (\d{1,2}:\d{2}\s[AP]M)  # Meeting time
                                    .*?,\s  # Potential extra text for meeting time
                                    (.*?)\n  # Location, usually a room
                                    .*?\n  # Chairman of committee holding event
                                    (.*)  # Description of event
                                    ''', block).groups()
                        except AttributeError:
                            continue

                        time = datetime.datetime.strptime(
                            time + "_" + date, '%I:%M %p_%A, %B %d, %Y')
                        time = self._tz.localize(time)

                        location = location.strip()

                        description = '\n'.join([
                            x.strip() for x in description.split('\n')
                            if x.strip() and not x.strip().startswith("Page ")
                            and not x.strip().startswith("*Possible Vote") and
                            not x.strip() == "NO OTHER COMMITTEES WILL MEET"
                        ])

                        if not description:
                            description = '[No description provided by state]'

                        event = Event(name=description,
                                      start_date=time,
                                      location_name=location,
                                      description=description)

                        event.add_source(PDF_URL)
                        event.add_participant(comm,
                                              type='committee',
                                              note='host')
                        for line in description.split('\n'):
                            related_bill = re.search(
                                r'(S\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$',
                                line)
                            if related_bill:
                                (related_bill,
                                 relation) = related_bill.groups()
                                relation = relation.strip()
                                related_bill = related_bill.replace(".", "")
                                item = event.add_agenda_item(relation)
                                item.add_bill(related_bill)

                        yield event
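
The parsing above hinges on a `re.split` detail: when the pattern contains a capturing group, the captured separators are returned interleaved with the surrounding chunks, so iterating over `days[1:]` and checking index parity pairs each date heading with the text below it. A minimal demonstration:

import re

text = 'intro Monday, April 3 senate items Tuesday, April 4 more items'
parts = re.split(r'(\w+day, \w+ \d{1,2})', text)
# parts == ['intro ', 'Monday, April 3', ' senate items ',
#           'Tuesday, April 4', ' more items']
for i, chunk in enumerate(parts[1:]):
    if i % 2 == 0:
        heading = chunk  # even offsets are the captured date headings
    else:
        body = chunk     # odd offsets are the text under that heading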
Code example #39
    def scrape(self, window=None):
        if window:
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        else:
            n_days_ago = None

        events = self.events(n_days_ago)

        for event, web_event in self._merge_events(events):
            body_name = event["EventBodyName"]

            if 'Board of Directors -' in body_name:
                body_name, event_name = [part.strip()
                                         for part
                                         in body_name.split('-')]
            else:
                event_name = body_name

            status_name = event['EventAgendaStatusName']
            if status_name == 'Draft':
                status = 'confirmed'
            elif status_name == 'Final':
                status = 'passed'
            elif status_name == 'Canceled':
                status = 'cancelled'
            else:
                status = 'tentative'

            location = event["EventLocation"]

            if not location:
                # We expect some events to have no location. LA Metro would
                # like these displayed in the Councilmatic interface. However,
                # OCD requires a value for this field. Add a sane default.
                location = 'Not available'

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=location,
                      status=status)

            e.pupa_id = str(event['EventId'])

            # Metro requires the EventGuid to build out MediaPlayer links.
            # Add both the English event GUID, and the Spanish event GUID if
            # it exists, to the extras dict.
            e.extras = {'guid': event['EventGuid']}

            if event.get('SAPEventGuid'):
                e.extras['sap_guid'] = event['SAPEventGuid']

            if 'event_details' in event:
                # if there is not a meeting detail page on legistar
                # don't capture the agenda data from the API
                for item in self.agenda(event):
                    agenda_item = e.add_agenda_item(item["EventItemTitle"])
                    if item["EventItemMatterFile"]:
                        identifier = item["EventItemMatterFile"]
                        agenda_item.add_bill(identifier)

                    if item["EventItemAgendaNumber"]:
                        # To the notes field, add the item number as given in the agenda minutes
                        note = "Agenda number, {}".format(item["EventItemAgendaNumber"])
                        agenda_item['notes'].append(note)

            e.add_participant(name=body_name,
                              type="organization")

            e.add_source(self.BASE_URL + '/events/{0}'.format(event['EventId']),
                         note='api')

            if event.get('SAPEventId'):
                e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                             note='api (sap)')

            if event['EventAgendaFile']:
                e.add_document(note='Agenda',
                               url=event['EventAgendaFile'],
                               media_type="application/pdf")

            if event['EventMinutesFile']:
                e.add_document(note='Minutes',
                               url=event['EventMinutesFile'],
                               media_type="application/pdf")

            for audio in event['audio']:
                try:
                    redirect_url = self.head(audio['url']).headers['Location']

                except KeyError:
                    # In some cases, the redirect URL does not yet
                    # contain the location of the audio file. Skip
                    # these events, and retry on next scrape.
                    continue

                e.add_media_link(note=audio['label'],
                                 url=redirect_url,
                                 media_type='text/html')

            if web_event['Recap/Minutes'] != 'Not\xa0available':
                e.add_document(note=web_event['Recap/Minutes']['label'],
                               url=web_event['Recap/Minutes']['url'],
                               media_type="application/pdf")

            if event['event_details']:
                for link in event['event_details']:
                    e.add_source(**link)
            else:
                e.add_source('https://metro.legistar.com/Calendar.aspx', note='web')

            yield e
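
The status ladder above (and its extended form in code example #35, which also treats 'Final Revised' and 'Final 2nd Revised' as passed) is a plain mapping from Legistar's EventAgendaStatusName to an OCD event status. A table-driven sketch of the same logic:

STATUS_MAP = {
    'Draft': 'confirmed',
    'Canceled': 'cancelled',
}

def ocd_status(status_name):
    # Any 'Final*' variant counts as a past meeting.
    if status_name.startswith('Final'):
        return 'passed'
    return STATUS_MAP.get(status_name, 'tentative')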
Code example #40
File: events.py  Project: kozmikyak/actibase
    def scrape(self):
        # Assumes module-level imports: requests, `html` from lxml.html,
        # and a `tz` timezone object; `datetime` here is the class
        # (from datetime import datetime).
        current_date = datetime.today()
        current_month = current_date.month
        current_year = current_date.year

        date_range = []

        for _ in range(4):
            # The original appended the timestamp only inside the
            # zero-padding branch, silently skipping October and November.
            if current_month != 12:
                cm = '{0:02d}'.format(current_month)
                date_range.append("{0}-{1}".format(current_year, cm))
                current_month += 1
            else:
                date_range.append("{0}-12".format(current_year))
                current_month = 1
                current_year += 1

        format1 = "%A %B %d, %Y - %I:%M %p"
        format2 = "%A %B %d, %Y - "
        format3 = "%m/%d/%y"
        for date in date_range:
            root = requests.get("https://www.stpaul.gov/calendar/" + date)
            base = html.fromstring(root.text)
            items = base.xpath('.//*/div[@class="view-content"]/div')
            meetings = []
            for i in items:
                if i.xpath('.//*/span[@class="date-display-single"]/text()'):
                    d = {}
                    d['date'] = i.xpath(
                        './/*/span[@class="date-display-single"]/text()')[0]
                    d['info'] = i.xpath(
                        './/*/span[@class="field-content"]/a/text()')[0]
                    d['link'] = i.xpath(
                        './/*/span[@class="field-content"]/a/@href')[0]
                    meetings.append(d)
            for m in meetings:
                m['link'] = "https://www.stpaul.gov" + m['link']
                r = requests.get(m['link'])
                b = html.fromstring(r.text)
                exists = b.xpath('.//div[@class="node-content clearfix"]')
                if len(exists) > 0:
                    date = exists[0].xpath(
                        './/*/span[@class="date-display-single"]/text()')
                    loc1 = exists[0].xpath(
                        './/*/div[@class="thoroughfare"]/text()')
                    loc2 = exists[0].xpath('.//*/div[@class="premise"]/text()')
                    if len(loc1) > 0:
                        m['location'] = loc1[0]
                        if len(loc2) > 0:
                            m['location'] += " " + loc2[0]
                    else:
                        # Without this guard, the 'N/A' fallback used to
                        # overwrite a valid street address whenever the
                        # premise div was missing.
                        m['location'] = 'N/A'
                    if ":" in date[0]:
                        m['date'] = datetime.strptime(date[0], format1)
                    elif "/" in date[0]:
                        new_date = date[0].split('/')
                        for n in new_date:
                            if len(n) == 1:
                                n = '0' + n
                                new_date = '/'.join(new_date)
                                m['date'] = datetime.strptime(
                                    new_date, format3)
                    else:
                        date = datetime.strptime(date[0], format2)
                        m['date'] = date
                    m['date'] = tz.localize(m['date'])
                    event = Event(name=m['info'].strip(),
                                  start_date=m['date'],
                                  location_name=m['location'])
                    if ('City Council' not in m['info']
                            and 'Legislative' not in m['info']
                            and 'Holiday' not in m['info']):
                        m['name'] = m['info'].replace('Meeting', '').replace(
                            ' - Cancelled', '').replace('Events', '').strip()
                        event.add_committee(m['name'])
                    elif 'Holiday' not in m['info']:
                        event.add_committee('Saint Paul City Council')
                    event.add_source(m['link'])
                    yield event
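
The month-walking loop at the top of code example #40 only needs to produce four 'YYYY-MM' strings starting from the current month; the same result with standard date arithmetic (a sketch):

import datetime

def next_months(count=4):
    # Yield 'YYYY-MM' for the current month and the following count-1 months.
    first = datetime.date.today().replace(day=1)
    for _ in range(count):
        yield first.strftime('%Y-%m')
        # day 28 plus 4 days always lands in the next month
        first = (first.replace(day=28)
                 + datetime.timedelta(days=4)).replace(day=1)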
Code example #41
File: events.py  Project: unfold-inc/openstates
    def scrape_lower(self):
        PDF_URL = 'http://www.ohiohouse.gov/Assets/CommitteeSchedule/calendar.pdf'
        (path, _response) = self.urlretrieve(PDF_URL)
        text = convert_pdf(path, type='text-nolayout').decode()
        os.remove(path)

        # \wF+day was a typo; \w+day matches weekday names like 'Monday'.
        days = re.split(r'(\w+day, \w+ \d{1,2}, 20\d{2})', text)
        date = None
        for i, chunk in enumerate(days[1:]):
            if i % 2 == 0:
                date = chunk
            else:

                events = re.split(r'\n((?:\w+\s?)+)\n', chunk)
                comm = ''
                for j, block in enumerate(events[1:]):
                    if j % 2 == 0:
                        comm = block.strip()
                    else:

                        try:
                            (time, location, description) = re.search(
                                r'''(?mxs)
                                    (\d{1,2}:\d{2}\s[ap]\.m\.)  # Meeting time
                                    .*?,\s  # Potential extra text for meeting time
                                    (.*?),\s  # Location, usually a room
                                    .*?\n  # Chairman of committee holding event
                                    (.*)  # Description of event
                                    ''', block).groups()
                        except AttributeError:
                            continue

                        time = time.replace(".", "").upper()
                        time = datetime.datetime.strptime(
                            time + "_" + date, '%I:%M %p_%A, %B %d, %Y')
                        time = self._tz.localize(time)

                        location = location.strip()

                        description = '\n'.join([
                            x.strip() for x in description.split('\n')
                            if x.strip() and not x.strip()[0].isdigit()
                        ])

                        if not description:
                            description = '[No description provided by state]'

                        event = Event(name=description,
                                      start_date=time,
                                      location_name=location,
                                      description=description)
                        event.add_source(PDF_URL)
                        event.add_participant(comm,
                                              type='committee',
                                              note='host')
                        for line in description.split('\n'):
                            related_bill = re.search(
                                r'(H\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$',
                                line)
                            if related_bill:
                                (related_bill,
                                 relation) = related_bill.groups()
                                relation = relation.strip()
                                related_bill = related_bill.replace(".", "")
                                item = event.add_agenda_item(relation)
                                item.add_bill(related_bill)

                        yield event
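
The `(?mxs)` prefix used in the verbose patterns of both Ohio scrapers switches on re.MULTILINE, re.VERBOSE, and re.DOTALL inline, which is what permits the commented, multi-line pattern and lets `.` match across newlines. The equivalent spelled-out form:

import re

PATTERN = re.compile(
    r'''
    (\d{1,2}:\d{2}\s[ap]\.m\.)  # meeting time
    .*?,\s                      # optional extra text after the time
    (.*?),\s                    # location, usually a room
    .*?\n                       # chairman line
    (.*)                        # description of the event
    ''',
    re.MULTILINE | re.VERBOSE | re.DOTALL)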
Code example #42
    def scrape(self, window=3):
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        for api_event, event in self.events(n_days_ago):

            when = api_event['start']
            location = api_event['EventLocation']

            description = event['Meeting\xa0Topic']

            if any(each in description
                   for each in ('Multiple meeting items',
                                'AGENDA TO BE ANNOUNCED')):
                description = None

            # An empty string is the Event default, so both branches of the
            # original if/else collapse into a single constructor call.
            e = Event(name=api_event["EventBodyName"],
                      start_date=when,
                      description=description or '',
                      location_name=location,
                      status=api_event['status'])

            e.pupa_id = str(api_event['EventId'])

            if event['Multimedia'] != 'Not\xa0available':
                e.add_media_link(note='Recording',
                                 url=event['Multimedia']['url'],
                                 type="recording",
                                 media_type='text/html')

            self.addDocs(e, event, 'Agenda')
            self.addDocs(e, event, 'Minutes')

            location_string = event[u'Meeting Location']
            location_notes, other_orgs = self._parse_location(location_string)

            if location_notes:
                e.extras = {'location note': ' '.join(location_notes)}

            if e.name == 'City Council Stated Meeting':
                participating_orgs = ['New York City Council']
            elif 'committee' in e.name.lower():
                participating_orgs = [e.name]
            else:
                participating_orgs = []

            if other_orgs:
                other_orgs = re.sub('Jointl*y with the ', '', other_orgs)
                participating_orgs += re.split(' and the |, the ', other_orgs)

            for org in participating_orgs:
                e.add_committee(name=org)

            for item in self.agenda(api_event):
                agenda_item = e.add_agenda_item(item["EventItemTitle"])
                if item["EventItemMatterFile"]:
                    identifier = item["EventItemMatterFile"]
                    agenda_item.add_bill(identifier)

            participants = set()

            for call in self.rollcalls(api_event):
                if call['RollCallValueName'] == 'Present':
                    participants.add(call['RollCallPersonName'].strip())

            for person in participants:
                e.add_participant(name=person,
                                  type="person")

            e.add_source(self.BASE_URL + '/events/{EventId}'.format(**api_event),
                         note='api')

            try:
                detail_url = event['Meeting Details']['url']
            except TypeError:
                e.add_source(self.EVENTSPAGE, note='web')
            else:
                if requests.head(detail_url).status_code == 200:
                    e.add_source(detail_url, note='web')

            yield e
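
Code example #42 records the 'web' source only when a HEAD request for the detail page returns 200, which keeps dead links out of the output. The same guard as a small helper (a sketch; the timeout value is arbitrary):

import requests

def url_is_live(url, timeout=10):
    # True when a HEAD request succeeds with HTTP 200.
    try:
        return requests.head(url, timeout=timeout).status_code == 200
    except requests.RequestException:
        return False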