コード例 #1
0
    def scrape(self):
        """Yield one Event per row of Boston's meeting-results table.

        Each matching table row is unpacked into exactly three cells: the
        meeting name, its date (``MM/DD/YYYY``) and a cell of links. Every
        anchor in the third cell is attached to the event as a link whose
        note is the anchor text.
        """
        url = "http://meetingrecords.cityofboston.gov/sirepub/meetresults.aspx"

        page = self.lxmlize(url)
        for entry in page.xpath(
                "//tr[@style='font-family: Verdana; font-size: 12px;']"):
            name, when, links = entry.xpath(".//td")
            # NOTE(review): u"\xc2\xa0" is the two-character sequence
            # U+00C2 U+00A0, presumably a mis-decoded non-breaking space;
            # confirm against the page's actual encoding.
            name = name.text.strip().replace(u"\xc2\xa0", "")
            when = when.text.strip().replace(u"\xc2\xa0", "")
            when = dt.datetime.strptime(when, "%m/%d/%Y")
            links = {x.text: x.attrib['href'] for x in links.xpath(".//a")}
            e = Event(name=name, when=when, location='unknown')

            e.add_source(url)
            # Use a distinct loop name: rebinding `url` here would make every
            # later event record a document link as its source.
            for note, link_url in links.items():
                e.add_link(note=note, url=link_url)

            yield e
コード例 #2
0
    def scrape(self):
        """Yield one Event per row of Boston's meeting-results table.

        Each matching table row is unpacked into exactly three cells: the
        meeting name, its date (``MM/DD/YYYY``) and a cell of links. Every
        anchor in the third cell is attached to the event as a link whose
        note is the anchor text.
        """
        source_url = (
            "http://meetingrecords.cityofboston.gov/sirepub/meetresults.aspx")

        page = self.lxmlize(source_url)
        for entry in page.xpath(
                "//tr[@style='font-family: Verdana; font-size: 12px;']"):
            name, when, links = entry.xpath(".//td")
            # NOTE(review): u"\xc2\xa0" is the two-character sequence
            # U+00C2 U+00A0, presumably a mis-decoded non-breaking space;
            # confirm against the page's actual encoding.
            name = name.text.strip().replace(u"\xc2\xa0", "")
            when = when.text.strip().replace(u"\xc2\xa0", "")
            when = dt.datetime.strptime(when, "%m/%d/%Y")
            links = links.xpath(".//a")
            links = {x.text: x.attrib['href'] for x in links}
            e = Event(name=name,
                      when=when,
                      location='unknown')

            # The page URL is kept under its own name so the link loop below
            # cannot overwrite it; previously the loop variable shadowed it
            # and later events were given a document href as their source.
            e.add_source(source_url)
            for note, href in links.items():
                e.add_link(note=note, url=href)

            yield e
コード例 #3
0
    def scrape(self):
        """Yield an Event for each row of the Philadelphia Legistar calendar.

        Rows whose time cell contains "Deferred" are skipped. When the time
        cell is empty only the date is parsed, so ``when`` carries no
        time-of-day information (midnight). Every ``<a href>`` found in the
        agenda and minutes cells is attached to the event as a link.
        """
        url = "http://phila.legistar.com/Calendar.aspx/"
        page = self.lxmlize(url)
        main = page.xpath("//table[@class='rgMasterTable']")[0]
        # Skip the first row of the table (presumably the header row).
        rows = main.xpath(".//tr")[1:]
        for row in rows:
            if "No records were found." in row.text_content():
                self.warning("Hum. They don't seem to have events?")
                continue

            # `_` swallows the cell holding the image next to the date.
            (name, date, _, time, where, agenda, minutes) = row.xpath(".//td")

            name = name.text_content().strip()  # leaving an href on the table
            time = time.text_content().strip()
            location = where.text_content().strip()

            if "Deferred" in time:
                continue

            day = date.text.strip()
            if time == "":
                # No time given: parse the date alone.
                when = dt.datetime.strptime(day, "%m/%d/%Y")
            else:
                when = dt.datetime.strptime(
                    "%s %s" % (day, time), "%m/%d/%Y %I:%M %p")

            event = Event(name=name, when=when, location=location)
            event.add_source(url)

            # Agenda links first, then minutes links, same order as before.
            for cell in (agenda, minutes):
                for anchor in cell.xpath(".//a[@href]"):
                    event.add_link(anchor.text, anchor.attrib['href'])

            yield event
コード例 #4
0
    def scrape(self):
        """Yield an Event for each row of the 'archive' and 'upcoming' tables.

        For every row the title and date cells become the event's name and
        time. If the agenda cell holds a link, the agenda page is fetched
        and each of its links containing 'metaviewer.php' is attached as a
        document. Rows from the 'archive' table additionally get video,
        audio and minutes media links when the matching cells hold a link.
        """
        meetings_html = self.urlopen(self.ARLINGTON_MEETING_PAGE)
        meetings_lxml = lxml.html.fromstring(meetings_html)

        for meeting_type in ('archive', 'upcoming'):
            for meeting in meetings_lxml.cssselect('#%s tbody tr' % meeting_type):

                # attempt to map the cells across table types.
                # if the sizes mismatch, ignore this one (it's an "empty" message)
                try:
                    cell_mapping = self._organize_cells(meeting_type, meeting.cssselect('td'))
                except Exception:
                    # Narrowed from a bare except so KeyboardInterrupt,
                    # SystemExit and GeneratorExit are no longer swallowed.
                    continue

                meeting_title = cell_mapping['title'].text
                # The span text is read as an integer Unix timestamp.
                meeting_date = datetime.datetime.fromtimestamp(
                    int(cell_mapping['date'].cssselect('span')[0].text))

                e = Event(name=meeting_title, when=meeting_date, location='unknown')
                e.add_source(self.ARLINGTON_MEETING_PAGE)

                # detect agenda url, if present
                meeting_agenda_url = None
                agenda_anchors = cell_mapping['agenda'].cssselect('a')
                if agenda_anchors:
                    meeting_agenda_url = agenda_anchors[0].attrib.get('href')

                # follow the agenda URL and attempt to extract associated documents
                if meeting_agenda_url is not None:
                    e.add_link(meeting_agenda_url)
                    e.add_document(name='Agenda', url=meeting_agenda_url, mimetype='text/html')

                    meeting_agenda_html = self.urlopen(meeting_agenda_url)
                    meeting_agenda_lxml = lxml.html.fromstring(meeting_agenda_html)
                    for link in meeting_agenda_lxml.cssselect('a'):
                        link_url = link.attrib.get('href', '')
                        if not link_url:
                            continue
                        if 'metaviewer.php' in link_url.lower():
                            # NOTE: application/pdf is a guess, may not always be correct
                            if link.text is not None:
                                e.add_document(name=link.text, url=link_url, mimetype='application/pdf')

                # skip everything below here for the 'upcoming' table
                if meeting_type == 'upcoming':
                    continue

                # detect video
                # TODO: extract actual mp4 files
                video_cell = cell_mapping['video'].cssselect('a')
                if video_cell:
                    # The URL sits inside a quoted string in the onclick
                    # handler. Capture only the URL: the closing apostrophe
                    # anchors the match but must not be part of the result
                    # (group(0) used to include it).
                    video_url_match = re.search(
                        r"(http://.*?)'", video_cell[0].attrib.get('onclick', ''))
                    if video_url_match is not None:
                        e.add_media_link(name="Video", url=video_url_match.group(1), mimetype='text/html')

                # detect audio
                audio_cell = cell_mapping['audio'].cssselect('a')
                if audio_cell:
                    e.add_media_link(name="Audio", url=audio_cell[0].attrib.get('href', ''), mimetype='audio/mpeg')

                # detect minutes
                minutes_cell = cell_mapping['minutes'].cssselect('a')
                if minutes_cell:
                    e.add_media_link(name="Minutes", url=minutes_cell[0].attrib.get('href', ''), mimetype='text/html')

                yield e
コード例 #5
0
    def scrape(self):
        """Yield an Event for each row of the 'archive' and 'upcoming' tables.

        For every row the title and date cells become the event's name and
        time. If the agenda cell holds a link, the agenda page is fetched
        and each of its links containing 'metaviewer.php' is attached as a
        document. Rows from the 'archive' table additionally get video,
        audio and minutes media links when the matching cells hold a link.
        """
        meetings_html = self.urlopen(self.ARLINGTON_MEETING_PAGE)
        meetings_lxml = lxml.html.fromstring(meetings_html)

        for meeting_type in ('archive', 'upcoming'):
            for meeting in meetings_lxml.cssselect('#%s tbody tr' %
                                                   meeting_type):

                # attempt to map the cells across table types.
                # if the sizes mismatch, ignore this one (it's an "empty" message)
                try:
                    cell_mapping = self._organize_cells(
                        meeting_type, meeting.cssselect('td'))
                except Exception:
                    # Narrowed from a bare except so KeyboardInterrupt,
                    # SystemExit and GeneratorExit are no longer swallowed.
                    continue

                meeting_title = cell_mapping['title'].text
                # The span text is read as an integer Unix timestamp.
                meeting_date = datetime.datetime.fromtimestamp(
                    int(cell_mapping['date'].cssselect('span')[0].text))

                e = Event(name=meeting_title,
                          when=meeting_date,
                          location='unknown')
                e.add_source(self.ARLINGTON_MEETING_PAGE)

                # detect agenda url, if present
                meeting_agenda_url = None
                agenda_anchors = cell_mapping['agenda'].cssselect('a')
                if agenda_anchors:
                    meeting_agenda_url = agenda_anchors[0].attrib.get('href')

                # follow the agenda URL and attempt to extract associated documents
                if meeting_agenda_url is not None:
                    e.add_link(meeting_agenda_url)
                    e.add_document(name='Agenda',
                                   url=meeting_agenda_url,
                                   mimetype='text/html')

                    meeting_agenda_html = self.urlopen(meeting_agenda_url)
                    meeting_agenda_lxml = lxml.html.fromstring(
                        meeting_agenda_html)
                    for link in meeting_agenda_lxml.cssselect('a'):
                        link_url = link.attrib.get('href', '')
                        if not link_url:
                            continue
                        if 'metaviewer.php' in link_url.lower():
                            # NOTE: application/pdf is a guess, may not always be correct
                            if link.text is not None:
                                e.add_document(name=link.text,
                                               url=link_url,
                                               mimetype='application/pdf')

                # skip everything below here for the 'upcoming' table
                if meeting_type == 'upcoming':
                    continue

                # detect video
                # TODO: extract actual mp4 files
                video_cell = cell_mapping['video'].cssselect('a')
                if video_cell:
                    # The URL sits inside a quoted string in the onclick
                    # handler. Capture only the URL: the closing apostrophe
                    # anchors the match but must not be part of the result
                    # (group(0) used to include it).
                    video_url_match = re.search(
                        r"(http://.*?)'",
                        video_cell[0].attrib.get('onclick', ''))
                    if video_url_match is not None:
                        e.add_media_link(name="Video",
                                         url=video_url_match.group(1),
                                         mimetype='text/html')

                # detect audio
                audio_cell = cell_mapping['audio'].cssselect('a')
                if audio_cell:
                    e.add_media_link(name="Audio",
                                     url=audio_cell[0].attrib.get('href', ''),
                                     mimetype='audio/mpeg')

                # detect minutes
                minutes_cell = cell_mapping['minutes'].cssselect('a')
                if minutes_cell:
                    e.add_media_link(name="Minutes",
                                     url=minutes_cell[0].attrib.get(
                                         'href', ''),
                                     mimetype='text/html')

                yield e