Example 1
def fetch(pid):
    '''
    Fetch a single paper from The American Presidency Project website
    (http://www.presidency.ucsb.edu/) and return it as a standard paper/article dict
    '''
    url = base_url + '/ws/index.php?pid=' + pid
    soup = get_soup(url)
    # the HTML they generate is awkward, to say the least
    author, title = soup.find('title').get_text().split(': ', 1)
    date_string = soup.find('span', class_='docdate').string
    date = datetime.strptime(date_string, '%B %d, %Y')
    timestamp = date.date().isoformat()

    displaytext = soup.find('span', class_='displaytext')
    text = '\n'.join(iter_lines(displaytext))

    paper = dict(author=author,
                 title=title.strip('.'),
                 timestamp=timestamp,
                 source=url,
                 text=text)

    displaynotes = soup.find('span', class_='displaynotes')
    # the notes span may be missing entirely
    note = displaynotes.get_text(' ') if displaynotes else None
    if note:
        # remove "Note: " prefix if present
        paper['note'] = re.sub(r'^note:\s+', '', note, flags=re.I)

    return paper
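This example leans on module-level imports and helpers that are not shown (re, datetime.strptime, get_soup, iter_lines, base_url). A minimal sketch of what the helpers might look like, assuming requests and BeautifulSoup; the bodies here are guesses, not the project's actual implementation:

import requests
from bs4 import BeautifulSoup

base_url = 'http://www.presidency.ucsb.edu'

def get_soup(url, **kwargs):
    # fetch a page and parse it; kwargs (e.g. params=) are forwarded to requests.get
    response = requests.get(url, **kwargs)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

def iter_lines(*elements):
    # yield whitespace-normalized, non-empty lines of text from each element
    for element in elements:
        for string in element.stripped_strings:
            yield ' '.join(string.split())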
Example 2
def fetch_election(year):
    '''
    Fetch all papers related to an election campaign; year (a string) should be
    one of: '2016', '2012', '2008', '2004', '1960'
    '''
    year_html = get_html(base_url + '/' + year + '_election.php')
    if year == '2008':
        # fix weird issue in Fred Thompson's entry
        year_html = year_html.replace(
            'Status: withdrew on <span class="docdate">',
            'Status: <span class="docdate">withdrew on ')
    soup = BeautifulSoup(year_html, 'html.parser')
    container = soup.find('td', class_='doctext').find_parent('table')
    for td in container.find_all('td', class_='doctext'):
        paragraphs = td.find_all('p')
        # expect an info paragraph followed by a links paragraph
        if len(paragraphs) >= 2:
            info_paragraph, links_paragraph = paragraphs[:2]
            candidate = _get_candidate_info(info_paragraph)
            for category, category_url in _iter_candidate_categories(
                    links_paragraph):
                logger.info('Fetching papers from category "%s"', category)
                category_soup = get_soup(category_url)
                category_pids = _get_pids(category_soup)
                for pid in category_pids:
                    paper = fetch(pid)
                    if candidate['name'] != paper['author']:
                        logger.warning(
                            'candidate name "%s" does not match paper author "%s" (%s)',
                            candidate['name'], paper['author'], pid)
                    paper['category'] = category
                    yield paper
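Several of these examples also call a _get_pids helper to pull document IDs off a listing page. Its implementation is not shown; a plausible sketch, assuming the IDs appear as pid= query parameters in anchor hrefs:

import re

def _get_pids(soup):
    # collect the ?pid=NNNNN document IDs linked from a listing page
    pids = []
    for anchor in soup.find_all('a', href=True):
        match = re.search(r'[?&]pid=(\d+)', anchor['href'])
        if match:
            pids.append(match.group(1))
    return pids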
Example 3
def fetch_transition(year):
    '''
    Fetch all papers related to a presidential transition; year (a string)
    should be one of: '2017', '2009', '2001'
    '''
    soup = get_soup(base_url + '/transition' + year + '.php')
    for pid in _get_pids(soup):
        paper = fetch(pid)
        yield paper
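Since fetch_transition is a generator, callers consume it lazily; a hypothetical usage:

for paper in fetch_transition('2017'):
    print(paper['timestamp'], paper['title'])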
Example 4
def _iter_speeches():
    soup = get_soup(base_url + '/president/speeches')
    current_author = None
    for child in soup.find(id='listing').children:
        if child.name == 'h2':
            current_author = child.contents[0].strip()
        elif child.name == 'div':
            for anchor in child.select('.title a'):
                title, date = _split_title(anchor.text)
                yield current_author, title, date, anchor['href']
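_split_title is another helper whose body is not shown. Judging by the call site it separates a listing entry into a title and a date; a sketch, assuming entries end in a parenthesized date (that format is a guess):

import re

def _split_title(text):
    # split "Some Title (January 20, 2009)" into ('Some Title', 'January 20, 2009')
    match = re.match(r'^(.*?)\s*\(([^)]*)\)\s*$', text)
    if match:
        return match.group(1), match.group(2)
    return text.strip(), None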
Example 5
def fetch(page_url):
    '''
    Given a full ABC News url, or just the partial path, fetch the page and
    return a standard speech dict
    '''
    url = page_url
    if not url.startswith(base_url):
        url = base_url + '/' + url.lstrip('/')
    soup = get_soup(url)
    timestamp_string = soup.find(class_='timestamp').get_text()
    return {
        'source': url,
        'timestamp': parse_date(timestamp_string).isoformat(),
        'text': '\n'.join(_iter_article_paragraphs(soup)),
    }
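parse_date here (and in the following examples) is presumably dateutil's parser, and _iter_article_paragraphs walks the article body. Both are sketched below as assumptions, with 'article p' a guess at ABC News markup:

from dateutil.parser import parse as parse_date

def _iter_article_paragraphs(soup):
    # yield the non-empty, whitespace-normalized text of each body paragraph
    for paragraph in soup.select('article p'):
        text = ' '.join(paragraph.get_text().split())
        if text:
            yield text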
Example 6
def _fetch_page(url):
    soup = get_soup(url)
    # heading_title = soup.find(class_='heading-title')
    # heading_subtitle = soup.find(class_='heading-subtitle')
    press_article_date = soup.find(class_='press-article-date')
    date = press_article_date.get_text()
    title = soup.find(class_='pane-node-title')
    content = soup.find(id='content-start')
    # body = content.select_one('.forall-body.field-type-text-long')
    bodies = content.select('.forall-body')
    text = u'\n'.join(iter_lines(*bodies))
    return {
        'source': url,
        'timestamp': parse_date(date).isoformat(),
        'text': text,
    }
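A hypothetical usage, with a made-up whitehouse.gov path (real page URLs would come from a listing such as _iter_group_pages below):

page = _fetch_page(base_url + '/the-press-office/some-press-release')  # hypothetical path
print(page['timestamp'])
print(page['text'][:200])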
Example 7
def fetch(program_id):
    '''
    Scrape C-SPAN transcript from https://www.c-span.org/ by Program ID

    Returns a standard paper/article dict
    '''
    url = 'https://www.c-span.org/video/?' + program_id
    soup = get_soup(url)
    dl = soup.find(id='more-information').find(class_='details').find('dl')
    details = {k.strip(':'): v for k, v in iter_datalist_pairs(dl)}
    first_aired_date = ''.join(details['First Aired'].split('|')[:-1])
    return {
        'source': url,
        'text': '\n'.join(_fetch_transcript_paragraphs(program_id)),
        'timestamp': parse_date(first_aired_date).isoformat(),
        'location': details['Location'],
        'category': details['Format'].lower(),
    }
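iter_datalist_pairs is assumed to walk the <dt>/<dd> pairs of a definition list; a sketch of that helper (the real body is not shown):

def iter_datalist_pairs(dl):
    # pair each <dt> term with the <dd> value that follows it
    for dt, dd in zip(dl.find_all('dt'), dl.find_all('dd')):
        yield dt.get_text(), dd.get_text()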
Example 8
def fetch_inaugurals():
    ordinals = ['Zeroth', 'First', 'Second', 'Third', 'Fourth']
    soup = get_soup(base_url + '/inaugurals.php')
    pids = _get_pids(soup)
    # TAPP doesn't title (number) each inaugural distinctly
    authors = dict()
    for pid in pids:
        paper = fetch(pid)
        author = paper['author']
        nth = authors[author] = authors.get(author, 0) + 1
        # TAPP does not use consistent titles; e.g.,
        #   Richard Nixon gets "Oath of Office and Second Inaugural Address"
        #   Lyndon B. Johnson gets "The President's Inaugural Address"
        # So we generate titles consistent with Miller Center's titles
        title = 'Inaugural Address'
        if nth > 1:
            title = ordinals[nth] + ' ' + title
        paper['title'] = title
        yield paper
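A hypothetical usage, dumping each inaugural as a line of JSON:

import json

for paper in fetch_inaugurals():
    print(json.dumps(paper))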
Example 9
def _iter_group_pages(url):
    '''
    Iterate over (title, url) pairs for a given page (usually called with a root
    briefing-room group page url)
    '''
    soup = get_soup(url)
    view = soup.find(class_='view')
    rows = view.find_all(class_='views-row')
    # list the rows on this page
    for row in rows:
        a = row.find('a')
        page_title = a.get_text()
        page_url = urljoin(base_url, a['href'])
        yield page_title, page_url
    # recurse into the next page
    for a in view.find(class_='pager').find_all('a'):
        if a.get_text() == 'Next':
            next_url = urljoin(base_url, a['href'])
            for page_title, page_url in _iter_group_pages(next_url):
                yield page_title, page_url
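Because each results page recurses into the next, a very long listing could in principle exhaust Python's recursion limit; an equivalent iterative sketch (same assumed helpers) avoids that:

def _iter_group_pages_flat(url):
    # same traversal as above, with an explicit loop instead of recursion
    while url:
        soup = get_soup(url)
        view = soup.find(class_='view')
        for row in view.find_all(class_='views-row'):
            a = row.find('a')
            yield a.get_text(), urljoin(base_url, a['href'])
        next_url = None
        pager = view.find(class_='pager')
        for a in (pager.find_all('a') if pager else []):
            if a.get_text() == 'Next':
                next_url = urljoin(base_url, a['href'])
                break
        url = next_url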
Example 10
def _fetch_transcript_paragraphs(program_id):
    url = 'https://www.c-span.org/video/?' + program_id + '&action=getTranscript&transcriptType=cc'
    soup = get_soup(url)
    for paragraph in soup.find_all('p'):
        text = ''.join(filter(not_empty, map(strip, _iter_transcript_paragraph_strings(paragraph))))
        yield ' '.join(text.strip().split())
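not_empty, strip, and _iter_transcript_paragraph_strings are helpers whose bodies are not shown; the first two are probably just thin wrappers, sketched here (the third is omitted, since the transcript markup is unknown):

def strip(text):
    # str.strip as a standalone function, for use with map()
    return text.strip()

def not_empty(text):
    # truthiness test as a named predicate, for use with filter()
    return bool(text)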
Example 11
def fetch_listing(params):
    soup = get_soup(base_url + '/ws/index.php', params=params)
    for pid in _get_pids(soup):
        paper = fetch(pid)
        yield paper
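A hypothetical usage; the query parameters accepted by TAPP's ws/index.php endpoint are not documented in these examples, so the key below is made up:

for paper in fetch_listing({'category': 'inaugurals'}):  # hypothetical params
    print(paper['timestamp'], paper['title'])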