def scrape_document_text(link):
    """Fetch the page at *link* and return the text of its 'displaytext' span.

    Used for speech/statement/press pages whose relevant content lives in
    a <span class="displaytext"> element.
    """
    page = make_soup(link)
    span = page.find("span", {"class": "displaytext"})
    return span.text
def scrape_document_text(link):
    """Return the speech/statement/press text found at *link*.

    The relevant content is the text of the page's
    <span class="displaytext"> element.
    """
    return make_soup(link).find("span", {"class": "displaytext"}).text
def get_page_text(link):
    """Return the relevant text of a senate document page.

    Parameters
    ----------
    link : str
        URL of the senate document page.

    Returns
    -------
    str
        Text of the page's <pre class="styled"> element, with residual
        HTML tags and [bracketed] annotations replaced by spaces.
    """
    soup = custom_utilities.make_soup(link)
    text = soup.find("pre", {"class": "styled"}).text
    # Raw strings for regex patterns: the original non-raw "[\[].*?[\]]"
    # relies on an invalid string escape (SyntaxWarning/W605 in modern
    # Python) even though it happens to produce the same pattern.
    text = re.sub(r'<[^>]+>', ' ', text)      # strip leftover HTML tags
    text = re.sub(r"[\[].*?[\]]", " ", text)  # drop [bracketed] annotations
    return text
def get_page_text(link):
    """Given a link to a senate doc, return the relevant text on that page."""
    soup = custom_utilities.make_soup(link)
    raw = soup.find("pre", {"class": "styled"}).text
    # Replace leftover HTML tags, then bracketed annotations, with spaces.
    without_tags = re.sub('<[^>]+>', ' ', raw)
    return re.sub("[\[].*?[\]]", " ", without_tags)
def get_text_of_individual_documents(link):
    """Scrape a |candidate -- date -- link| table page.

    Parameters
    ----------
    link : str
        URL of the page containing the table of documents.

    Returns
    -------
    dict
        Maps each row's date string to the scraped text of that row's
        document link.
    """
    rv = {}
    soup = make_soup(link)
    # Find the table of links to individual documents; skip the header row.
    cand_table = soup.findAll(
        "table", {"width": "700", "border": "0", "align": "center"})
    rows = cand_table[0].findAll("tr")[1:]
    # For each row, build the absolute document URL and scrape its text.
    for row in rows:
        elems = row.findAll("td")
        # Distinct name: the original rebound the `link` parameter here,
        # shadowing the function argument inside the loop.
        doc_link = ("http://www.presidency.ucsb.edu"
                    + elems[2].findAll("a", href=True)[0]['href'][2:])
        rv[elems[1].text] = scrape_document_text(doc_link)
    return rv
def collect_suitable_links_for_day(link):
    """Given a link to a day of senate proceedings, return links to follow.

    Parameters
    ----------
    link : str
        URL of the day's proceedings page.

    Returns
    -------
    dict
        Maps the text of each suitable link to ``{'link': href}``.  Empty
        when there was no senate activity on the date or the date is
        invalid.
    """
    links = {}
    soup = custom_utilities.make_soup(link)
    main_table = soup.findAll("table", {"class": "item_table"})
    if not main_table:
        # No senate activity on this date, or an invalid date.  Return an
        # empty dict for a consistent return type (the original returned
        # [] here but a dict on the normal path).
        return {}
    # For each column in the table, grab the first anchor's link if it is
    # one we should follow.
    for td in main_table[0].findAll("td"):
        a = td.findAll("a")
        if should_follow_link(a[0].text):
            links[a[0].text] = {'link': a[0]['href']}
    return links
def get_links_to_documents_pages(link):
    """Traverse the main page and collect links to the pages with
    statements, etc., keyed by candidate name.
    """
    rv = {}
    soup = make_soup(link)
    for row in soup.findAll('td', {'class': 'doctext'}):
        spans = row.findAll("span")
        if not spans:
            continue  # rows without <span> elements carry no links
        candidate = spans[0].text
        rv[candidate] = {
            anchor.text: "http://www.presidency.ucsb.edu/" + anchor['href']
            for anchor in row.findAll("a")
        }
    return rv
def get_links_to_documents_pages(link):
    """Build {candidate: {link text: absolute URL}} from the main page.

    Rows without <span> elements contain no links and are skipped.
    """
    result = {}
    doc_rows = make_soup(link).findAll('td', {'class': 'doctext'})
    for doc_row in doc_rows:
        span_elems = doc_row.findAll("span")
        if len(span_elems) == 0:
            continue
        name = span_elems[0].text
        result[name] = {}
        for anchor in doc_row.findAll("a"):
            result[name][anchor.text] = (
                "http://www.presidency.ucsb.edu/" + anchor['href'])
    return result
def get_text_of_individual_documents(link):
    """Return {date: document text} scraped from a page holding a
    |candidate -- date -- link| table.
    """
    texts = {}
    soup = make_soup(link)
    table = soup.findAll(
        "table", {"width": "700", "border": "0", "align": "center"})[0]
    # Drop the header row, then walk the remaining data rows.
    for row in table.findAll("tr")[1:]:
        cells = row.findAll("td")
        href = cells[2].findAll("a", href=True)[0]['href'][2:]
        doc_url = "http://www.presidency.ucsb.edu" + href
        texts[cells[1].text] = scrape_document_text(doc_url)
    return texts