コード例 #1
0
    def parse(self, response):
        """Yield one ``CalEventItem`` per ``#Events div.event`` element.

        After the events of the first page (depth 0) have been emitted, a
        single follow-up request for the ``.peernav .next a`` link is yielded
        so that exactly one additional page is crawled.

        :param response: Scrapy response for a calendar listing page.
        :returns: generator of ``CalEventItem`` objects and, at depth 0, at
            most one ``scrapy.Request`` for the next page.
        """
        soup = BeautifulSoup(response.text, 'lxml')

        # Use .get() so a response without a 'depth' entry in its meta is
        # treated as the first page instead of raising KeyError.
        depth = response.meta.get('depth') or 0

        for elm in soup.select('#Events div.event'):
            info_elm = elm.select_one('.info')
            info = list(info_elm.stripped_strings)

            event_summary = ' '.join(elm.select_one('h2').stripped_strings)
            # First string of the info block is the date/time, the remaining
            # strings make up the location.
            event_dt = self.__parse_dt(info[0])
            event_location = '\n'.join(info[1:])
            # Everything after the info block is the description, except
            # siblings whose class list is exactly ['up'].
            event_desc = '\n'.join(
                utils.clean_html(str(e))
                for e in info_elm.find_next_siblings()
                if e.attrs.get('class') != ['up'])

            yield CalEventItem(date=event_dt,
                               summary=event_summary,
                               description=event_desc,
                               location=event_location)

        if depth == 0:
            link = soup.select_one('.peernav .next a')
            # The last page has no "next" link; stop quietly in that case.
            if link is not None and link.get('href'):
                # urljoin() leaves absolute URLs untouched and resolves
                # relative ones, which scrapy.Request would otherwise reject.
                yield scrapy.Request(response.urljoin(link['href']),
                                     meta={'depth': depth + 1})
コード例 #2
0
ファイル: brooklyn-cb9.py プロジェクト: codebutler/59boards
    def parse_event(self, response):
        """Yield a single ``CalEventItem`` built from an event detail page.

        The ``.meeting_wrap p`` paragraphs are read as label/value pairs
        (``Date``, ``Time``, ``Venue``, ``Address``). Nothing is yielded when
        the date or time is empty or the time reads ``CANCELLED``.

        :param response: Scrapy response for the event page; its URL is used
            as the event description.
        """
        soup = BeautifulSoup(response.text, 'lxml')

        # Each paragraph must unpack into exactly two children: the label
        # node (with a trailing colon) and the value node.
        data = {}
        for paragraph in soup.select('.meeting_wrap p'):
            label_node, value_node = paragraph.children
            label = label_node.string.rstrip(':').strip()
            value = value_node.string.strip()
            data[label] = value

        date_str, time_str = data['Date'], data['Time']
        venue, address = data['Venue'], data['Address']

        if time_str == 'CANCELLED' or not (date_str and time_str):
            return

        date_str = utils.strip_date_ords(date_str)
        time_str = time_str.replace('.', '')

        event_date = datetime.strptime(date_str, '%B %d, %Y').date()
        event_time = self.__parse_time(time_str)
        naive_dt = datetime.combine(event_date, event_time)
        event_dt = timezone('US/Eastern').localize(naive_dt)

        title_elm = soup.select('.et_main_title')[0]
        # Skip whichever of venue/address is empty when building the location.
        location_lines = filter(None, (venue, address))

        yield CalEventItem(date=event_dt,
                           summary=title_elm.text.strip(),
                           description=response.url,
                           location='\n'.join(location_lines))
コード例 #3
0
    def parse(self, response):
        """Yield a ``CalEventItem`` for each event heading on the page.

        An ``<h2>``/``<h3>`` inside ``.about-description`` counts as an event
        only when the next three non-blank, non-``<br>`` siblings are plain
        text nodes; they are read as date, time and location, in that order.
        The description is an HTML link to the next ``<a>`` sibling (the
        agenda), or ``None`` when there is no such link.

        :param response: Scrapy response for the calendar page.
        """
        soup = BeautifulSoup(response.text, 'lxml')
        title_elms = soup.select('.about-description h2, .about-description h3')

        for elm in title_elms:
            siblings = list(itertools.islice(
                (x for x in elm.next_siblings
                 if x.encode().strip() and x.name != 'br'),
                3))

            # Ensure next 3 elements (ignoring whitespace/<br>s) are text.
            if len(siblings) != 3 or not all(x.name is None for x in siblings):
                continue

            summary_text = elm.text
            date_text, time_text, location_text = (
                str(node).strip() for node in siblings)

            agenda_elm = elm.find_next_sibling('a')
            if agenda_elm is None:
                # No agenda link published for this event.
                description_text = None
            else:
                agenda_href = urljoin(self.start_urls[0],
                                      agenda_elm.attrs['href'])
                agenda_text = agenda_elm.text
                # The closing quote after the href value is required for
                # the anchor to be well-formed HTML.
                description_text = f'<a href="{agenda_href}">{agenda_text}</a>'

            event_date = self.__parse_date(date_text)
            event_time = self.__parse_time(time_text)
            event_dt = datetime.combine(event_date, event_time)

            yield CalEventItem(
                date=event_dt,
                summary=summary_text,
                description=description_text,
                location=location_text
            )
コード例 #4
0
ファイル: bronx-cb2.py プロジェクト: codebutler/59boards
 def parse(self, response):
     """Yield a ``CalEventItem`` for each calendar component with a start.

     The response body is parsed as iCalendar data. Each direct subcomponent
     that carries a ``DTSTART`` becomes one item whose date is that start
     value localized to US/Eastern.

     :param response: Scrapy text response containing iCalendar data.
     """
     # response.text replaces the deprecated body_as_unicode() accessor.
     cal = Calendar.from_ical(response.text)
     eastern = timezone('US/Eastern')
     for vevent in cal.subcomponents:
         dtstart = vevent.get('DTSTART')
         if dtstart is None:
             # Subcomponents without a start cannot be placed on the
             # calendar; skip them instead of failing on ``None.dt``.
             continue
         yield CalEventItem(id=vevent.get('UID'),
                            date=eastern.localize(dtstart.dt),
                            summary=vevent.get('SUMMARY'),
                            description=vevent.get('URL'),
                            location=vevent.get('LOCATION'))
コード例 #5
0
ファイル: queens-cb3.py プロジェクト: codebutler/59boards
    def parse_event_ical(self, response):
        """Yield a ``CalEventItem`` for each usable iCalendar subcomponent.

        Components titled 'Alternate Side Parking Rules Suspended' and
        components without a ``DTSTART`` are skipped. The location is not
        read from the calendar data; it is taken from
        ``response.meta['location']`` when present.

        :param response: Scrapy text response containing iCalendar data.
        """
        # response.text replaces the deprecated body_as_unicode() accessor.
        cal = Calendar.from_ical(response.text)
        event_location = response.meta.get('location')

        for vevent in cal.subcomponents:
            event_summary = vevent.get('SUMMARY')
            if event_summary == 'Alternate Side Parking Rules Suspended':
                continue

            dtstart = vevent.get('DTSTART')
            if dtstart is None:
                # Subcomponents without a start cannot be dated; skip them
                # instead of failing on ``None.dt``.
                continue

            yield CalEventItem(date=dtstart.dt,
                               summary=event_summary,
                               description=vevent.get('DESCRIPTION'),
                               location=event_location)
コード例 #6
0
    def parse(self, response):
        """Yield ``CalEventItem`` objects parsed from a ``<br>``-separated list.

        The text of the first ``.highlight_bodytext`` element is split into
        lines at every ``<br>``. Each line is then classified as:

        * a date line  - ``<day name> <month name> <day number> ...``
        * a year line  - ``<month name> <number> ...`` (the number is used
          as the current year)
        * anything else - event text, buffered until the next date/year line
          or the end of input; the first buffered line is the summary and the
          second (if any) is the location.

        :param response: Scrapy response for the calendar page.
        """
        soup = BeautifulSoup(response.text, 'lxml')
        tag = soup.select('.highlight_bodytext')[0]  # type: Tag
        lines = []
        current = ""

        for child in tag.descendants:  # type: Tag
            if isinstance(child, NavigableString):
                # Non-breaking spaces are normalised to plain spaces.
                current += child.string.strip('\n').replace(u'\xa0', ' ')
            if child.name == 'br':
                if current:
                    lines.append(current)
                current = ""
        if current:
            lines.append(current)

        lines = [line.strip() for line in lines if line.strip()]

        current_month = None
        current_day = None
        current_year = None
        text_buffer = []
        last_index = len(lines) - 1

        for index, text in enumerate(lines):
            tokens = re.split(r'\W+', text)
            # Length checks keep short lines such as "May" or "Monday" from
            # raising IndexError; they are treated as ordinary event text.
            is_date = (len(tokens) > 2
                       and tokens[0] in DAY_NAMES
                       and tokens[1] in MONTH_NAMES
                       and tokens[2].isdigit())
            is_year = (len(tokens) > 1
                       and tokens[0] in MONTH_NAMES
                       and tokens[1].isdigit())
            is_last = index == last_index

            if is_date:
                current_month = MONTH_NAMES.index(tokens[1]) + 1
                current_day = int(tokens[2])
            elif is_year:
                current_year = int(tokens[1])
            else:
                text_buffer.append(text)

            # NOTE(review): the buffer is flushed *after* the date/year state
            # has been updated, so buffered text is stamped with the values
            # from the line that triggered the flush. Confirm against the
            # page layout that this is the intended pairing. Also note that
            # datetime() raises if year/month/day has not been seen yet.
            if (is_date or is_year or is_last) and text_buffer:
                summary = text_buffer[0]
                location = text_buffer[1] if len(text_buffer) > 1 else None
                yield CalEventItem(
                    date=datetime(year=current_year,
                                  month=current_month,
                                  day=current_day),
                    summary=summary,
                    description=None,
                    location=location
                )
                text_buffer = []
コード例 #7
0
ファイル: brooklyn-cb3.py プロジェクト: codebutler/59boards
    def parse(self, response):
        """Yield a ``CalEventItem`` for each row of the first extracted table.

        The response body is JSON; rows are read from
        ``pages[0].tables[0].data``. Each row supplies the committee name
        (summary), the chair (description), a ``TIME`` such as ``6:30 PM``
        and a ``DATE`` such as ``05-Jan-18``.

        :param response: Scrapy text response containing the JSON document.
        :raises KeyError, IndexError: if the JSON lacks the expected layout.
        :raises ValueError: if a row's time or date is not in the expected
            format.
        """
        # response.text replaces the deprecated body_as_unicode() accessor.
        pdf = json.loads(response.text)

        rows = pdf['pages'][0]['tables'][0]['data']
        for event in rows:
            committee = event['COMMITTEE']
            chair = event['CHAIR/VICE-CHAIR/CO-CHAIR']
            # Named *_text so the locals do not shadow the ``time``/``date``
            # names commonly imported from the standard library.
            time_text = event['TIME']
            date_text = event['DATE']

            event_time = datetime.strptime(time_text, '%I:%M %p').time()
            event_date = datetime.strptime(date_text, '%d-%b-%y').date()

            yield CalEventItem(
                date=datetime.combine(event_date, event_time),
                summary=committee,
                description=chair,
                location=None
            )
コード例 #8
0
ファイル: brooklyn-cb6.py プロジェクト: codebutler/59boards
    def parse(self, response):
        """Yield a ``CalEventItem`` for each ``<h3>`` under ``.about-description``.

        Every direct-child ``<h3>`` starts an event whose content is the run
        of following siblings up to (not including) the next ``<hr>``,
        ``<h3>`` or ``<h2>``. Within that run:

        * date        - parsed from the ``<h3>`` text
        * summary     - first ``<h4>``, else first ``<p>`` made only of ``<b>``
        * location    - first ``<p>`` containing no ``<b>`` child
        * description - cleaned HTML of the first ``<ul>``

        :param response: Scrapy response for the calendar page.
        """
        soup = BeautifulSoup(response.text, 'lxml')

        for tag in soup.select('.about-description > h3'):
            # Find index of next event header.
            all_siblings = tag.select('~ *')
            next_header = tag.select_one('~ hr, ~ h3, ~ h2')

            # Find all siblings up to next header (or end of document if last
            # event). Slice rather than re-query with ``limit=``: when the
            # header is the very next sibling the index is 0, and a limit of
            # 0 means "no limit", which would return every sibling.
            if next_header:
                event_tags = all_siblings[:all_siblings.index(next_header)]
            else:
                event_tags = all_siblings

            # Use <h3> text as date.
            event_date = self.__parse_date(tag.string)

            # Find first <h4> sibling.
            event_summary = next(
                (t.string for t in event_tags if t.name == 'h4'), None)

            # If not found, look for first <p> sibling where all children are <b>.
            if not event_summary:
                event_summary = next(
                    (t.string
                     for t in event_tags if t.name == 'p' and t.string and all(
                         c.name == 'b' for c in t.children)), None)

            # Find first <p> sibling where all children are not <b>.
            event_location = next(
                (', '.join(t.stripped_strings) for t in event_tags
                 if t.name == 'p' and all(c.name != 'b' for c in t.children)),
                None)

            # Find first <ul> sibling and capture entire html.
            event_description = next((utils.clean_html(str(t))
                                      for t in event_tags if t.name == 'ul'),
                                     None)

            yield CalEventItem(date=event_date,
                               summary=event_summary,
                               location=event_location,
                               description=event_description)
コード例 #9
0
    def parse(self, response):
        """Yield a ``CalEventItem`` for each ``<li>`` under ``.about-description``.

        Each list item is read as ``<date/time text> <b>title</b> <location
        text>``. The date text has the form ``Monday, January 8, 6PM``;
        anything after a ``-`` (e.g. an end time) is discarded. The page does
        not state a year, so the current year is used.

        :param response: Scrapy response for the calendar page.
        :raises ValueError: if the date text does not match the expected
            format.
        """
        soup = BeautifulSoup(response.text, 'lxml')
        current_year = datetime.now().year

        for li in soup.select('.about-description li'):
            title_elm = li.select_one('b')
            if title_elm is None:
                # Not an event entry (no bold title to anchor on).
                continue

            summary = ''.join(title_elm.stripped_strings)
            # find_previous_siblings() returns the nearest node first, so
            # reverse it to rebuild the text in document order.
            datetime_text = ''.join(
                reversed(title_elm.find_previous_siblings(text=True))).strip()
            location = ''.join(title_elm.find_next_siblings(text=True)).strip()

            # Keep only the start of a range such as "6PM - 8PM".
            datetime_text = datetime_text.split('-')[0].strip()

            # Parse together with the year: without one strptime assumes
            # 1900, which is not a leap year, so "February 29" would be
            # rejected before the year could be substituted.
            event_dt = datetime.strptime(
                '{} {}'.format(current_year, datetime_text),
                '%Y %A, %B %d, %I%p')

            yield CalEventItem(date=event_dt,
                               summary=summary,
                               description=None,
                               location=location)
コード例 #10
0
def parse_calendarjs(response):
    """Yield ``CalEventItem`` objects from a JavaScript calendar data file.

    Only lines starting with ``calEvents[calEvents.length]`` are used. The
    quoted string assigned on such a line is split on ``|``: the first field
    is a ``m/d/Y`` date and the second is an HTML fragment holding the event
    text (and, optionally, a time of day).

    When a time is found the item date is a US/Eastern-localized datetime;
    otherwise it is a plain ``date``.

    :param response: Scrapy text response containing the JavaScript source.
    """
    def parse_text(text) -> tuple:
        """Return ``(event_time, summary)`` for an HTML fragment.

        ``event_time`` is the first token that ``CAL.parse`` flags as a time
        (flag ``2``), or ``None`` when no token qualifies. ``summary`` is all
        text of the fragment joined by single spaces.
        """
        event_time = None
        soup = BeautifulSoup(text, 'html.parser')
        for token in soup.stripped_strings:
            result, flag = CAL.parse(token)
            if flag == 2:
                event_time = datetime.fromtimestamp(mktime(result)).time()
                break
        return event_time, ' '.join(soup.stripped_strings)

    def parse_date(text) -> date:
        """Parse a ``m/d/Y`` date such as ``2/8/2017``."""
        return datetime.strptime(text, '%m/%d/%Y').date()

    eastern = timezone('US/Eastern')

    for line in response.text.splitlines():
        if not line.startswith('calEvents[calEvents.length]'):
            continue

        # Split on the first ' = ' only, so payloads that themselves contain
        # ' = ' are not truncated; strip() lets the trailing-quote pattern
        # below match even when the line ends in whitespace.
        js_str = line.split(' = ', 1)[1].strip()
        # Resolve JavaScript backslash escapes. unicode_escape decodes bytes
        # as Latin-1, so characters outside Latin-1 are first turned into
        # \uXXXX escapes; encoding as UTF-8 here would corrupt non-ASCII text.
        js_str = js_str.encode('latin-1', 'backslashreplace').decode(
            'unicode_escape')
        # Drop the surrounding quotes and optional trailing semicolon.
        js_str = re.sub(r'[\'"];?$', '', js_str)
        js_str = re.sub(r'^[\'"]', '', js_str)

        parts = js_str.split('|')

        event_date = parse_date(parts[0])
        event_time, event_summary = parse_text(parts[1])

        # Compare with None explicitly rather than relying on the
        # truthiness of a time object.
        if event_time is not None:
            event_dt = eastern.localize(
                datetime.combine(event_date, event_time))
        else:
            event_dt = event_date

        yield CalEventItem(
            date=event_dt,
            summary=event_summary,
            location='',
            description=''
        )