def scrape_document_text(link):
    """Fetch the page at *link* and return the text of its 'displaytext' span.

    Used for speech/statement/press pages whose relevant content lives in
    a <span class="displaytext"> element.
    """
    page = make_soup(link)
    span = page.find("span", {"class": "displaytext"})
    return span.text
def scrape_document_text(link):
    """Return the speech/statement/press text found at *link*.

    The relevant content is the text of the page's
    <span class="displaytext"> element.
    """
    return make_soup(link).find("span", {"class": "displaytext"}).text
def get_page_text(link):
    """Return the relevant text of a senate document page.

    Parameters
    ----------
    link : str
        URL of the senate document page.

    Returns
    -------
    str
        Text of the page's <pre class="styled"> element, with residual
        HTML tags and [bracketed] annotations replaced by spaces.
    """
    soup = custom_utilities.make_soup(link)
    text = soup.find("pre", {"class": "styled"}).text
    # Raw strings for regex patterns: the original non-raw "[\[].*?[\]]"
    # relies on an invalid string escape (SyntaxWarning/W605 in modern
    # Python) even though it happens to produce the same pattern.
    text = re.sub(r'<[^>]+>', ' ', text)      # strip leftover HTML tags
    text = re.sub(r"[\[].*?[\]]", " ", text)  # drop [bracketed] annotations
    return text
def get_page_text(link):
    """Given a link to a senate doc, return the relevant text on that page."""
    soup = custom_utilities.make_soup(link)
    raw = soup.find("pre", {"class": "styled"}).text
    # Replace leftover HTML tags, then bracketed annotations, with spaces.
    without_tags = re.sub('<[^>]+>', ' ', raw)
    return re.sub("[\[].*?[\]]", " ", without_tags)
def get_text_of_individual_documents(link):
    """Scrape a |candidate -- date -- link| table page.

    Parameters
    ----------
    link : str
        URL of the page containing the table of documents.

    Returns
    -------
    dict
        Maps each row's date string to the scraped text of that row's
        document link.
    """
    rv = {}
    soup = make_soup(link)
    # Find the table of links to individual documents; skip the header row.
    cand_table = soup.findAll(
        "table", {"width": "700", "border": "0", "align": "center"})
    rows = cand_table[0].findAll("tr")[1:]
    # For each row, build the absolute document URL and scrape its text.
    for row in rows:
        elems = row.findAll("td")
        # Distinct name: the original rebound the `link` parameter here,
        # shadowing the function argument inside the loop.
        doc_link = ("http://www.presidency.ucsb.edu"
                    + elems[2].findAll("a", href=True)[0]['href'][2:])
        rv[elems[1].text] = scrape_document_text(doc_link)
    return rv
def collect_suitable_links_for_day(link):
    """Given a link to a day of senate proceedings, return links to follow.

    Parameters
    ----------
    link : str
        URL of the day's proceedings page.

    Returns
    -------
    dict
        Maps the text of each suitable link to ``{'link': href}``.  Empty
        when there was no senate activity on the date or the date is
        invalid.
    """
    links = {}
    soup = custom_utilities.make_soup(link)
    main_table = soup.findAll("table", {"class": "item_table"})
    if not main_table:
        # No senate activity on this date, or an invalid date.  Return an
        # empty dict for a consistent return type (the original returned
        # [] here but a dict on the normal path).
        return {}
    # For each column in the table, grab the first anchor's link if it is
    # one we should follow.
    for td in main_table[0].findAll("td"):
        a = td.findAll("a")
        if should_follow_link(a[0].text):
            links[a[0].text] = {'link': a[0]['href']}
    return links
def get_links_to_documents_pages(link):
    """Traverse the main page and collect links to the pages with
    statements, etc., keyed by candidate name.
    """
    rv = {}
    soup = make_soup(link)
    for row in soup.findAll('td', {'class': 'doctext'}):
        spans = row.findAll("span")
        if not spans:
            continue  # rows without <span> elements carry no links
        candidate = spans[0].text
        rv[candidate] = {
            anchor.text: "http://www.presidency.ucsb.edu/" + anchor['href']
            for anchor in row.findAll("a")
        }
    return rv
def get_links_to_documents_pages(link):
    """Build {candidate: {link text: absolute URL}} from the main page.

    Rows without <span> elements contain no links and are skipped.
    """
    result = {}
    doc_rows = make_soup(link).findAll('td', {'class': 'doctext'})
    for doc_row in doc_rows:
        span_elems = doc_row.findAll("span")
        if len(span_elems) == 0:
            continue
        name = span_elems[0].text
        result[name] = {}
        for anchor in doc_row.findAll("a"):
            result[name][anchor.text] = (
                "http://www.presidency.ucsb.edu/" + anchor['href'])
    return result
def get_text_of_individual_documents(link):
    """Return {date: document text} scraped from a page holding a
    |candidate -- date -- link| table.
    """
    texts = {}
    soup = make_soup(link)
    table = soup.findAll(
        "table", {"width": "700", "border": "0", "align": "center"})[0]
    # Drop the header row, then walk the remaining data rows.
    for row in table.findAll("tr")[1:]:
        cells = row.findAll("td")
        href = cells[2].findAll("a", href=True)[0]['href'][2:]
        doc_url = "http://www.presidency.ucsb.edu" + href
        texts[cells[1].text] = scrape_document_text(doc_url)
    return texts