import re
from datetime import datetime
from urllib.parse import urljoin

from bs4 import BeautifulSoup

# base_url, logger, parse_date, and the get_* / _get_* helpers are assumed to
# be defined at module scope (each scraper module sets its own base_url)


def fetch(pid):
    '''
    Fetch single paper from The American Presidency Project website
    (http://www.presidency.ucsb.edu/) and return as standard paper/article dict
    '''
    url = base_url + '/ws/index.php?pid=' + pid
    soup = get_soup(url)
    # the HTML they generate is awkward, to say the least
    author, title = soup.find('title').get_text().split(': ', 1)
    date_string = soup.find('span', class_='docdate').string
    date = datetime.strptime(date_string, '%B %d, %Y')
    timestamp = date.date().isoformat()
    displaytext = soup.find('span', class_='displaytext')
    text = '\n'.join(iter_lines(displaytext))
    paper = dict(author=author, title=title.strip('.'), timestamp=timestamp,
                 source=url, text=text)
    displaynotes = soup.find('span', class_='displaynotes')
    note = displaynotes.get_text(' ') or None
    if note:
        # remove "Note: " prefix if present
        paper['note'] = re.sub(r'^note:\s+', '', note, flags=re.I)
    return paper
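# The fetchers in this section share a couple of helpers that are not shown.
# A minimal sketch of `get_soup` and `iter_lines`, assuming `get_soup` wraps
# requests + BeautifulSoup and `iter_lines` yields cleaned lines of text from
# one or more elements; the real implementations may differ (e.g., iter_lines
# probably breaks lines on <p>/<br> boundaries rather than per-string).

import requests


def get_soup(url, **kwargs):
    # fetch a page and parse the HTML; extra kwargs (e.g., params) are
    # passed through to requests.get
    response = requests.get(url, **kwargs)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def iter_lines(*elements):
    # yield non-empty, whitespace-normalized lines of text from each element
    for element in elements:
        for string in element.stripped_strings:
            line = ' '.join(string.split())
            if line:
                yield line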
def fetch_election(year):
    '''
    Fetch all papers related to an election campaign; year (a string) should
    be one of: '2016', '2012', '2008', '2004', '1960'
    '''
    year_html = get_html(base_url + '/' + year + '_election.php')
    if year == '2008':
        # fix weird issue in Fred Thompson's entry
        year_html = year_html.replace(
            'Status: withdrew on <span class="docdate">',
            'Status: <span class="docdate">withdrew on ')
    soup = BeautifulSoup(year_html, 'html.parser')
    container = soup.find('td', class_='doctext').find_parent('table')
    for td in container.find_all('td', class_='doctext'):
        paragraphs = td.find_all('p')
        if paragraphs:
            info_paragraph, links_paragraph = paragraphs
            candidate = _get_candidate_info(info_paragraph)
            for category, category_url in _iter_candidate_categories(links_paragraph):
                logger.info('Fetching papers from category "%s"', category)
                category_soup = get_soup(category_url)
                for pid in _get_pids(category_soup):
                    paper = fetch(pid)
                    if candidate['name'] != paper['author']:
                        logger.warning(
                            'candidate name "%s" does not match paper author "%s" (%s)',
                            candidate['name'], paper['author'], pid)
                    paper['category'] = category
                    yield paper
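# Hedged sketches of the candidate helpers used above; the exact markup of
# TAPP's election pages is an assumption here (candidate name as the first
# bold/link text of the info paragraph, category links as plain anchors in
# the links paragraph), so treat these as illustrations rather than the
# canonical parsers.


def _get_candidate_info(info_paragraph):
    # assume the candidate's name is the first <a>/<b> text in the paragraph
    name = info_paragraph.find(['a', 'b']).get_text().strip()
    return {'name': name}


def _iter_candidate_categories(links_paragraph):
    # assume each anchor links to one category listing page
    for a in links_paragraph.find_all('a'):
        yield a.get_text().strip(), urljoin(base_url, a['href'])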
def fetch_transition(year):
    '''
    Fetch all papers related to a presidential transition; year (a string)
    should be one of: '2017', '2009', '2001'
    '''
    soup = get_soup(base_url + '/transition' + year + '.php')
    for pid in _get_pids(soup):
        yield fetch(pid)
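# `_get_pids` is used by several fetchers; a minimal sketch, assuming every
# TAPP document link carries a `pid` query parameter (the real helper may
# scope its search or deduplicate more carefully).

from urllib.parse import parse_qs, urlparse


def _get_pids(soup):
    pids = []
    for a in soup.find_all('a', href=True):
        query = parse_qs(urlparse(a['href']).query)
        if 'pid' in query:
            pids.append(query['pid'][0])
    return pids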
def _iter_speeches():
    '''
    Iterate over (author, title, date, href) tuples from the speeches
    listing, tracking the current author from the intervening <h2> headings
    '''
    soup = get_soup(base_url + '/president/speeches')
    current_author = None
    for child in soup.find(id='listing').children:
        if child.name == 'h2':
            current_author = child.contents[0].strip()
        elif child.name == 'div':
            for anchor in child.select('.title a'):
                title, date = _split_title(anchor.text)
                yield current_author, title, date, anchor['href']
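# A sketch of `_split_title`, assuming the anchor text looks like
# "Address to Congress (February 28, 2017)" with the date in a trailing
# parenthetical; that format is an assumption, not confirmed by the source.


def _split_title(text):
    match = re.match(r'^(.*)\s+\(([^)]+)\)\s*$', text.strip())
    if match:
        return match.group(1), match.group(2)
    return text.strip(), None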
def fetch(page_url):
    '''
    Given a full ABC News url, or just the partial path, fetch the page and
    return a standard speech dict
    '''
    url = page_url
    if not url.startswith(base_url):
        url = base_url + '/' + url.lstrip('/')
    soup = get_soup(url)
    timestamp_string = soup.find(class_='timestamp').get_text()
    return {
        'source': url,
        'timestamp': parse_date(timestamp_string).isoformat(),
        'text': '\n'.join(_iter_article_paragraphs(soup)),
    }
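# A minimal sketch of `_iter_article_paragraphs`, assuming the article body
# lives in <p> tags inside an element marked itemprop="articleBody" (a common
# ABC News layout); the selector is an assumption.


def _iter_article_paragraphs(soup):
    body = soup.find(attrs={'itemprop': 'articleBody'}) or soup
    for p in body.find_all('p'):
        text = ' '.join(p.get_text().split())
        if text:
            yield text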
def _fetch_page(url):
    soup = get_soup(url)
    # heading_title = soup.find(class_='heading-title')
    # heading_subtitle = soup.find(class_='heading-subtitle')
    press_article_date = soup.find(class_='press-article-date')
    date = press_article_date.get_text()
    # the title element is located but not yet included in the returned dict
    title = soup.find(class_='pane-node-title')
    content = soup.find(id='content-start')
    # body = content.select_one('.forall-body.field-type-text-long')
    bodies = content.select('.forall-body')
    text = '\n'.join(iter_lines(*bodies))
    return {
        'source': url,
        'timestamp': parse_date(date).isoformat(),
        'text': text,
    }
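# Example usage: _fetch_page is typically fed URLs discovered by
# _iter_group_pages (defined below), so pairing the two fetches every page
# in a briefing-room group:
#
#     for page_title, page_url in _iter_group_pages(group_url):
#         page = _fetch_page(page_url)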
def fetch(program_id):
    '''
    Scrape C-SPAN transcript from https://www.c-span.org/ by Program ID

    Returns a standard paper/article dict
    '''
    url = 'https://www.c-span.org/video/?' + program_id
    soup = get_soup(url)
    dl = soup.find(id='more-information').find(class_='details').find('dl')
    details = {k.strip(':'): v for k, v in iter_datalist_pairs(dl)}
    first_aired_date = ''.join(details['First Aired'].split('|')[:-1])
    return {
        'source': url,
        'text': '\n'.join(_fetch_transcript_paragraphs(program_id)),
        'timestamp': parse_date(first_aired_date).isoformat(),
        'location': details['Location'],
        'category': details['Format'].lower(),
    }
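# A sketch of `iter_datalist_pairs`, assuming it walks a <dl> and pairs each
# <dt> label with the text of its sibling <dd>; the pairing strategy is an
# assumption.


def iter_datalist_pairs(dl):
    for dt in dl.find_all('dt'):
        dd = dt.find_next_sibling('dd')
        if dd is not None:
            yield dt.get_text().strip(), dd.get_text().strip()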
def fetch_inaugurals():
    ordinals = ['Zeroth', 'First', 'Second', 'Third', 'Fourth']
    soup = get_soup(base_url + '/inaugurals.php')
    pids = _get_pids(soup)
    # TAPP doesn't title (number) each inaugural distinctly
    authors = dict()
    for pid in pids:
        paper = fetch(pid)
        author = paper['author']
        nth = authors[author] = authors.get(author, 0) + 1
        # TAPP does not use consistent titles; e.g.,
        # Richard Nixon gets "Oath of Office and Second Inaugural Address"
        # Lyndon B. Johnson gets "The President's Inaugural Address"
        # So we generate titles consistent with Miller Center's titles
        title = 'Inaugural Address'
        if nth > 1:
            title = ordinals[nth] + ' ' + title
        paper['title'] = title
        yield paper
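# Example usage (output path hypothetical): write one JSON object per
# inaugural, one per line.
#
#     import json
#     with open('inaugurals.json', 'w') as fp:
#         for paper in fetch_inaugurals():
#             fp.write(json.dumps(paper) + '\n')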
def _iter_group_pages(url):
    '''
    Iterate over (title, url) pairs for a given page (usually called with a
    root briefing-room group page url)
    '''
    soup = get_soup(url)
    view = soup.find(class_='view')
    rows = view.find_all(class_='views-row')
    # list the rows on this page
    for row in rows:
        a = row.find('a')
        page_title = a.get_text()
        page_url = urljoin(base_url, a['href'])
        yield page_title, page_url
    # recurse into the next page (the last page has no pager element)
    pager = view.find(class_='pager')
    if pager is not None:
        for a in pager.find_all('a'):
            if a.get_text() == 'Next':
                next_url = urljoin(base_url, a['href'])
                for page_title, page_url in _iter_group_pages(next_url):
                    yield page_title, page_url
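# The recursion above goes one level deeper per "Next" link, so very long
# groups could hit the recursion limit. An equivalent iterative sketch using
# an explicit queue (same selectors, same assumptions):


def _iter_group_pages_iterative(url):
    queue = [url]
    while queue:
        soup = get_soup(queue.pop(0))
        view = soup.find(class_='view')
        for row in view.find_all(class_='views-row'):
            a = row.find('a')
            yield a.get_text(), urljoin(base_url, a['href'])
        pager = view.find(class_='pager')
        if pager is not None:
            for a in pager.find_all('a'):
                if a.get_text() == 'Next':
                    queue.append(urljoin(base_url, a['href']))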
def _fetch_transcript_paragraphs(program_id):
    url = ('https://www.c-span.org/video/?' + program_id +
           '&action=getTranscript&transcriptType=cc')
    soup = get_soup(url)
    for paragraph in soup.find_all('p'):
        # concatenate the paragraph's non-empty strings, then collapse all
        # runs of whitespace to single spaces
        text = ''.join(filter(not_empty, map(strip, _iter_transcript_paragraph_strings(paragraph))))
        yield ' '.join(text.strip().split())
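# Sketches of the small string helpers used above; `strip` and `not_empty`
# are assumed to be trivial wrappers, and `_iter_transcript_paragraph_strings`
# is assumed to yield a paragraph's raw strings (the real helper may skip
# timestamps or speaker labels; that filtering is an assumption).


def strip(s):
    return s.strip()


def not_empty(s):
    return len(s) > 0


def _iter_transcript_paragraph_strings(paragraph):
    for string in paragraph.strings:
        yield string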
def fetch_listing(params):
    '''
    Fetch all papers from a TAPP /ws/index.php listing selected by the given
    query params
    '''
    soup = get_soup(base_url + '/ws/index.php', params=params)
    for pid in _get_pids(soup):
        yield fetch(pid)
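# Example usage (parameter names hypothetical; consult TAPP's index.php for
# the actual query parameters it accepts):
#
#     for paper in fetch_listing({'month': '01', 'year': '2009'}):
#         print(paper['timestamp'], paper['title'])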