def parse(self, response):
    """Yield one CalEventItem per event on the page, then follow "next" once.

    Events are the ``#Events div.event`` elements. For each one the ``<h2>``
    text is the summary, the first string of ``.info`` is handed to
    ``self.__parse_dt`` for the date, the remaining ``.info`` strings form the
    location, and the siblings after ``.info`` (except those whose class list
    is exactly ``['up']``) are cleaned and joined into the description.

    Pagination is followed only from a depth-0 response, via the
    ``.peernav .next a`` link.

    :param response: the Scrapy response for an events listing page.
    :return: generator of CalEventItem objects and at most one scrapy.Request.
    """
    soup = BeautifulSoup(response.text, 'lxml')
    # A missing or falsy depth is treated as the first page instead of raising
    # KeyError.
    depth = response.meta.get('depth') or 0

    for elm in soup.select('#Events div.event'):
        event_summary = ' '.join(elm.select_one('h2').stripped_strings)
        info_elm = elm.select_one('.info')
        info = list(info_elm.stripped_strings)
        event_dt = self.__parse_dt(info[0])
        event_location = '\n'.join(info[1:])
        event_desc = '\n'.join(
            utils.clean_html(str(sibling))
            for sibling in info_elm.find_next_siblings()
            if sibling.attrs.get('class') != ['up'])
        yield CalEventItem(date=event_dt,
                           summary=event_summary,
                           description=event_desc,
                           location=event_location)

    if depth == 0:
        link = soup.select_one('.peernav .next a')
        # The page may have no "next" link (or one without an href); in that
        # case there is simply nothing more to follow.
        if link is not None and link.get('href'):
            # Resolve against the response URL so relative hrefs do not make
            # scrapy.Request fail with "Missing scheme".
            request = scrapy.Request(response.urljoin(link['href']))
            request.meta['depth'] = depth + 1
            yield request
def parse_event(self, response):
    """Yield a single CalEventItem built from an event detail page.

    The ``.meeting_wrap p`` paragraphs are read as ``label: value`` pairs
    (each paragraph must have exactly two children). The ``Date`` and ``Time``
    values are combined into a US/Eastern-localized datetime; ``Venue`` and
    ``Address`` are joined into the location. Nothing is yielded when the date
    or time is missing/empty or the time is the literal ``CANCELLED``.

    :param response: the Scrapy response for an event detail page.
    :return: generator yielding zero or one CalEventItem.
    """
    soup = BeautifulSoup(response.text, 'lxml')
    elems = soup.select('.meeting_wrap p')
    data = {
        key.string.rstrip(':').strip(): val.string.strip()
        for key, val in (elem.children for elem in elems)
    }

    # Use .get() so a field that is absent from the page is handled by the
    # guard below (and by the location filter) instead of raising KeyError.
    date_str = data.get('Date')
    time_str = data.get('Time')
    venue = data.get('Venue')
    address = data.get('Address')

    if not date_str or not time_str or time_str == 'CANCELLED':
        return

    date_str = utils.strip_date_ords(date_str)
    # Dots are removed before the time string is handed to __parse_time.
    time_str = time_str.replace('.', '')

    event_date = datetime.strptime(date_str, '%B %d, %Y').date()
    event_time = self.__parse_time(time_str)
    event_dt = timezone('US/Eastern').localize(
        datetime.combine(event_date, event_time))

    event_summary = soup.select('.et_main_title')[0].text.strip()
    # The page URL doubles as the description.
    event_description = response.url
    event_location = '\n'.join([x for x in (venue, address) if x])

    yield CalEventItem(date=event_dt,
                       summary=event_summary,
                       description=event_description,
                       location=event_location)
def parse(self, response):
    """Yield a CalEventItem for each heading followed by date/time/location text.

    Every ``<h2>``/``<h3>`` inside ``.about-description`` is a candidate. The
    next three siblings (ignoring whitespace-only nodes and ``<br>``) must all
    be plain text nodes; they are taken, in order, as date, time and location.
    The next sibling ``<a>`` (if any) becomes an HTML link in the description,
    with its href resolved against ``self.start_urls[0]``.

    :param response: the Scrapy response for the listing page.
    :return: generator of CalEventItem objects.
    """
    soup = BeautifulSoup(response.text, 'lxml')
    title_elms = soup.select('.about-description h2, .about-description h3')

    for elm in title_elms:
        meaningful = (
            node for node in elm.next_siblings
            if node.encode().strip() and node.name != 'br')
        siblings = list(itertools.islice(meaningful, 3))

        # Ensure next 3 elements (ignoring whitespace/<br>s) are text.
        if len(siblings) != 3 or any(x.name is not None for x in siblings):
            continue

        summary_text = elm.text
        date_text, time_text, location_text = (
            str(node).strip() for node in siblings)

        agenda_elm = elm.find_next_sibling('a')
        if agenda_elm is not None and agenda_elm.get('href'):
            agenda_href = urljoin(self.start_urls[0], agenda_elm['href'])
            agenda_text = agenda_elm.text
            # The href attribute value must be closed with a quote; without it
            # the emitted anchor is malformed HTML.
            description_text = f'<a href="{agenda_href}">{agenda_text}</a>'
        else:
            # No agenda link for this event: emit it without a description
            # rather than failing on the missing element.
            description_text = None

        event_date = self.__parse_date(date_text)
        event_time = self.__parse_time(time_text)
        event_dt = datetime.combine(event_date, event_time)

        yield CalEventItem(
            date=event_dt,
            summary=summary_text,
            description=description_text,
            location=location_text
        )
def parse(self, response):
    """Yield a CalEventItem for each calendar component that has a DTSTART.

    The response body is parsed as iCalendar data. For every direct
    subcomponent, DTSTART is localized to US/Eastern and UID, SUMMARY,
    LOCATION and URL are copied into the item (URL is used as description).

    :param response: the Scrapy response containing iCalendar text.
    :return: generator of CalEventItem objects.
    """
    # response.text replaces the deprecated response.body_as_unicode().
    cal = Calendar.from_ical(response.text)
    eastern = timezone('US/Eastern')

    for vevent in cal.subcomponents:
        dtstart = vevent.get('DTSTART')
        if dtstart is None:
            # Not every subcomponent carries a DTSTART (e.g. a VTIMEZONE
            # block); skip those instead of failing on `.dt`.
            continue

        # NOTE(review): localize() expects a naive datetime — confirm the feed
        # never supplies all-day (date-only) or already tz-aware values.
        event_dt = eastern.localize(dtstart.dt)

        yield CalEventItem(id=vevent.get('UID'),
                           date=event_dt,
                           summary=vevent.get('SUMMARY'),
                           description=vevent.get('URL'),
                           location=vevent.get('LOCATION'))
def parse_event_ical(self, response):
    """Yield a CalEventItem for each calendar component in an iCalendar response.

    DTSTART, SUMMARY and DESCRIPTION come from the component; the location is
    taken from ``response.meta['location']`` (None when absent). Components
    whose summary is exactly ``Alternate Side Parking Rules Suspended`` are
    skipped, as are components without a DTSTART.

    :param response: the Scrapy response containing iCalendar text.
    :return: generator of CalEventItem objects.
    """
    # response.text replaces the deprecated response.body_as_unicode().
    cal = Calendar.from_ical(response.text)
    event_location = response.meta.get('location', None)

    for vevent in cal.subcomponents:
        dtstart = vevent.get('DTSTART')
        if dtstart is None:
            # Subcomponents without a start (e.g. a VTIMEZONE block) cannot
            # become events; skip them instead of failing on `.dt`.
            continue

        event_summary = vevent.get('SUMMARY')
        if event_summary == 'Alternate Side Parking Rules Suspended':
            continue

        yield CalEventItem(date=dtstart.dt,
                           summary=event_summary,
                           description=vevent.get('DESCRIPTION'),
                           location=event_location)
def parse(self, response):
    """Yield CalEventItems parsed from the free-form ``.highlight_bodytext`` block.

    The text of the first ``.highlight_bodytext`` element is split into lines
    at ``<br>`` tags. Each line is then classified:

    * ``<day name> <month name> <digits>`` sets the current month and day;
    * ``<month name> <digits>`` sets the current year;
    * anything else is buffered as event text.

    Whenever a date line, a year line or the last line is reached and the
    buffer is non-empty, one item is emitted: the first buffered line is the
    summary and the second (if any) the location.

    :param response: the Scrapy response for the page.
    :return: generator of CalEventItem objects.
    """
    soup = BeautifulSoup(response.text, 'lxml')
    tag = soup.select('.highlight_bodytext')[0]  # type: Tag

    # Rebuild the visual lines: text nodes accumulate until a <br> ends them.
    lines = []
    pieces = []
    for child in tag.descendants:  # type: Tag
        if isinstance(child, NavigableString):
            pieces.append(child.string.strip('\n').replace(u'\xa0', ' '))
        if child.name == 'br':
            current = ''.join(pieces)
            if current:
                lines.append(current)
            pieces = []
    current = ''.join(pieces)
    if current:
        lines.append(current)

    lines = [line.strip() for line in lines if line.strip()]

    current_month = None
    current_day = None
    current_year = None
    text_buffer = []
    last_index = len(lines) - 1

    for index, text in enumerate(lines):
        tokens = re.split(r'\W+', text)
        # Check the token count first: a line that is only a day or month
        # name has too few tokens and used to raise IndexError here.
        is_date = (len(tokens) > 2
                   and tokens[0] in DAY_NAMES
                   and tokens[1] in MONTH_NAMES
                   and tokens[2].isdigit())
        is_year = (len(tokens) > 1
                   and tokens[0] in MONTH_NAMES
                   and tokens[1].isdigit())
        is_last = index == last_index

        if is_date:
            current_month = MONTH_NAMES.index(tokens[1]) + 1
            current_day = int(tokens[2])
        elif is_year:
            current_year = int(tokens[1])
        else:
            text_buffer.append(text)

        # NOTE(review): the date/year is updated *before* the buffer is
        # flushed, so buffered text is emitted with the newly parsed date.
        # That is right if date lines follow their event text, wrong if they
        # precede it — confirm against the page layout. Order kept as-is.
        if (is_date or is_year or is_last) and text_buffer:
            summary = text_buffer[0]
            location = text_buffer[1] if len(text_buffer) > 1 else None
            yield CalEventItem(
                date=datetime(year=current_year, month=current_month,
                              day=current_day),
                summary=summary,
                description=None,
                location=location
            )
            text_buffer = []
def parse(self, response):
    """Yield a CalEventItem for each row of the first table on the first page.

    The response body is JSON; rows are read from
    ``pdf['pages'][0]['tables'][0]['data']``. Each row's ``TIME``
    (``%I:%M %p``) and ``DATE`` (``%d-%b-%y``) are combined into a naive
    datetime; ``COMMITTEE`` is the summary and
    ``CHAIR/VICE-CHAIR/CO-CHAIR`` the description.

    :param response: the Scrapy response containing the JSON document.
    :return: generator of CalEventItem objects.
    :raises KeyError: if a row lacks one of the expected columns.
    :raises ValueError: if a time or date does not match its format.
    """
    # response.text replaces the deprecated response.body_as_unicode().
    pdf = json.loads(response.text)
    rows = pdf['pages'][0]['tables'][0]['data']

    for row in rows:
        committee = row['COMMITTEE']
        chair = row['CHAIR/VICE-CHAIR/CO-CHAIR']
        # Locals are named *_text so they do not shadow `time` / `date`.
        time_text = row['TIME']
        date_text = row['DATE']

        event_time = datetime.strptime(time_text, '%I:%M %p').time()
        event_date = datetime.strptime(date_text, '%d-%b-%y').date()

        yield CalEventItem(
            date=datetime.combine(event_date, event_time),
            summary=committee,
            description=chair,
            location=None
        )
def parse(self, response):
    """Yield a CalEventItem for each ``<h3>`` directly under ``.about-description``.

    The ``<h3>`` text is parsed as the date. The event's content is the run of
    following siblings up to (not including) the next ``<hr>``, ``<h3>`` or
    ``<h2>`` sibling, or all following siblings for the last event. Within
    that run:

    * summary: first ``<h4>``, else first ``<p>`` whose children are all ``<b>``;
    * location: first ``<p>`` none of whose children is ``<b>``;
    * description: cleaned HTML of the first ``<ul>``.

    :param response: the Scrapy response for the listing page.
    :return: generator of CalEventItem objects.
    """
    soup = BeautifulSoup(response.text, 'lxml')

    for tag in soup.select('.about-description > h3'):
        # Find index of next event header.
        all_siblings = tag.select('~ *')
        next_header = tag.select_one('~ hr, ~ h3, ~ h2')

        # Find all siblings up to next header (or end of document if last event).
        if next_header is not None:
            # Slice instead of re-selecting with `limit=`: when the header is
            # the very next sibling the index is 0, and a limit of 0 means
            # "no limit", which would wrongly pull in every later sibling.
            # Match by identity so an equal-looking earlier tag cannot win.
            cutoff = next(
                (i for i, sib in enumerate(all_siblings)
                 if sib is next_header),
                len(all_siblings))
            event_tags = all_siblings[:cutoff]
        else:
            event_tags = all_siblings

        # Use <h3> text as date.
        event_date = self.__parse_date(tag.string)

        # Find first <h4> sibling.
        event_summary = next(
            (t.string for t in event_tags if t.name == 'h4'), None)

        # If not found, look for first <p> sibling where all children are <b>.
        if not event_summary:
            event_summary = next(
                (t.string for t in event_tags
                 if t.name == 'p' and t.string and all(
                     c.name == 'b' for c in t.children)),
                None)

        # Find first <p> sibling where all children are not <b>.
        event_location = next(
            (', '.join(t.stripped_strings) for t in event_tags
             if t.name == 'p' and all(c.name != 'b' for c in t.children)),
            None)

        # Find first <ul> sibling and capture entire html.
        event_description = next(
            (utils.clean_html(str(t)) for t in event_tags if t.name == 'ul'),
            None)

        yield CalEventItem(date=event_date,
                           summary=event_summary,
                           location=event_location,
                           description=event_description)
def parse(self, response):
    """Yield a CalEventItem for each ``<li>`` under ``.about-description``.

    Inside each item the ``<b>`` element is the summary, the text before it
    is the date/time (only the part before the first ``-`` is used) and the
    text after it is the location. The date text carries no year, so the
    current year is used.

    :param response: the Scrapy response for the listing page.
    :return: generator of CalEventItem objects.
    :raises ValueError: if the date text does not match
        ``%A, %B %d, %I%p`` or is not a valid date in the current year.
    """
    soup = BeautifulSoup(response.text, 'lxml')
    current_year = datetime.now().year

    for li in soup.select('.about-description li'):
        title_elm = li.select_one('b')
        if title_elm is None:
            # List items without a bold title are not events.
            continue

        summary = ''.join(title_elm.stripped_strings)

        # find_previous_siblings() walks backwards from the title, so reverse
        # the result to put multiple text nodes back in document order.
        preceding_text = title_elm.find_previous_siblings(text=True)
        datetime_text = ''.join(reversed(preceding_text)).strip()
        location = ''.join(title_elm.find_next_siblings(text=True)).strip()

        if '-' in datetime_text:
            datetime_text = datetime_text.split('-')[0].strip()

        # Parse with the year included: without one strptime defaults to 1900,
        # which is not a leap year and therefore rejects February 29.
        event_dt = datetime.strptime(
            '{} {}'.format(datetime_text, current_year),
            '%A, %B %d, %I%p %Y')

        yield CalEventItem(date=event_dt,
                           summary=summary,
                           description=None,
                           location=location)
def parse_calendarjs(response):
    """Yield CalEventItems from ``calEvents[calEvents.length] = '...'`` lines.

    Each matching line of the response text assigns a quoted string of the
    form ``<m/d/Y>|<html>``. The first part is the event date; the HTML part
    supplies the summary (its text, space-joined) and, when one of its text
    fragments parses as a time, the start time. Events with a time get a
    US/Eastern-localized datetime; events without one get a plain date.
    Lines that lack the `` = `` separator or the ``|`` delimiter are skipped.

    :param response: the Scrapy response containing the calendar JavaScript.
    :return: generator of CalEventItem objects.
    """

    def parse_text(text) -> tuple:
        """Return ``(event_time_or_None, summary)`` extracted from HTML text."""
        event_time = None
        soup = BeautifulSoup(text, 'html.parser')
        for token in soup.stripped_strings:
            result, flag = CAL.parse(token)
            # NOTE(review): flag 2 is taken to mean "parsed as a time" by the
            # CAL parser — confirm against that library's documentation.
            if flag == 2:
                event_time = datetime.fromtimestamp(mktime(result)).time()
                break
        return event_time, ' '.join(soup.stripped_strings)

    def parse_date(text) -> date:
        """Parse a ``m/d/Y`` date such as ``2/8/2017``."""
        return datetime.strptime(text, '%m/%d/%Y').date()

    for line in response.text.splitlines():
        if not line.startswith('calEvents[calEvents.length]'):
            continue

        # partition() keeps everything after the first ' = ', so a payload
        # that itself contains ' = ' is not truncated.
        _, separator, js_str = line.partition(' = ')
        if not separator:
            continue
        js_str = js_str.lstrip()

        # Resolve JavaScript backslash escapes. Encoding via latin-1 with
        # backslashreplace keeps non-ASCII characters intact; going through
        # UTF-8 bytes would turn them into mojibake under 'unicode_escape'.
        js_str = js_str.encode('latin-1', 'backslashreplace').decode(
            'unicode_escape')

        # Strip the closing quote (with optional ';') and the opening quote.
        js_str = re.sub(r'[\'"];?$', '', js_str)
        js_str = re.sub(r'^[\'"]', '', js_str)

        parts = js_str.split('|')
        if len(parts) < 2:
            # No '|' delimiter: there is no text part to parse.
            continue

        event_date = parse_date(parts[0])
        event_time, event_summary = parse_text(parts[1])

        if event_time:
            event_dt = timezone('US/Eastern').localize(
                datetime.combine(event_date, event_time))
        else:
            event_dt = event_date

        yield CalEventItem(
            date=event_dt,
            summary=event_summary,
            location='',
            description=''
        )