def get_divisions():
    """Scrape the list of divisions and major groups from the OSHA website

    Divisions are the broadest grouping of SIC codes provided by OSHA
    Major groups are the second broadest grouping of SIC codes provided by OSHA
    """
    # Read site
    soup = get_soup(config.OSHA_base_url + 'sic_manual.html')

    # Find content
    container = soup.select('div#maincontain')[0]
    master_list = container.find('div').find('ol')
    all_links = master_list.find_all('a')

    # Store cleaned descriptions from anchor elements
    divisions = []
    for i in range(len(all_links)):
        # Store the full description provided by the site and keep the associated link
        l = all_links[i]
        full_desc = str(l.contents[0]).strip().encode("utf-8")
        link = l.get('href').encode("utf-8")

        # Get the description of the parent group
        if i > 0 and clean_desc(full_desc)[1] == 'Major Group':
            parent_desc = get_parent(divisions, i, 'Major Group', 'Division')
        else:
            parent_desc = str(None)

        # Add to running list of named tuples
        divisions.append(ind_group(full_desc, parent_desc, link))

    return divisions
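# The scraper above leans on three helpers defined elsewhere in the project
# (ind_group, clean_desc, get_parent). A minimal sketch of what they might look
# like follows; the field names, the group-type prefixes, and the backwards
# parent lookup are assumptions for illustration, not the project's actual code.
from collections import namedtuple

# One scraped entry: the raw description, its parent group's description, and a link
ind_group = namedtuple('ind_group', ['full_desc', 'parent_desc', 'link'])


def clean_desc(full_desc):
    """Return (text, group_type) for a description such as
    'Major Group 01: Agricultural Production Crops'."""
    text = full_desc.decode('utf-8') if isinstance(full_desc, bytes) else full_desc
    for group_type in ('SIC4', 'Major Group', 'Industry Group', 'Division'):
        if text.startswith(group_type):
            return text, group_type
    return text, None


def get_parent(groups, i, child_type, parent_type):
    """Walk backwards from position i and return the description of the most
    recent entry whose type is parent_type (child_type is kept only to mirror
    the call sites above)."""
    for prior in reversed(groups[:i]):
        if clean_desc(prior.full_desc)[1] == parent_type:
            return prior.full_desc
    return str(None)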
def explore_book(allbook, url, conn, cursor):
    # Throttle requests, then fetch and parse the page for this book
    sleep(1)
    soup = get_soup(url)
    book = get_book(soup, url)
    print(book.name, book.score, book.url)

    # Record the book as explored; update allbook if this book qualifies
    insert_explored_book(book, conn, cursor)
    if is_target_book(allbook, book):
        update_allbook(allbook, book, conn, cursor)

    # Recurse into linked books that have not been visited yet
    urls = urls_for_more_books(soup)
    for url in urls:
        if not_explored(url, cursor):
            explore_book(allbook, url, conn, cursor)
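# explore_book() relies on a pair of DB helpers that keep the crawl from
# revisiting pages. A rough sketch, assuming a sqlite3-style connection and an
# explored_books table with url/name/score columns (the schema and names here
# are guesses for illustration, not the project's actual implementation):
def insert_explored_book(book, conn, cursor):
    # Record this page so later calls to not_explored() skip it
    cursor.execute(
        "INSERT OR IGNORE INTO explored_books (url, name, score) VALUES (?, ?, ?)",
        (book.url, book.name, book.score),
    )
    conn.commit()


def not_explored(url, cursor):
    # True if the URL has never been recorded by insert_explored_book()
    cursor.execute("SELECT 1 FROM explored_books WHERE url = ? LIMIT 1", (url,))
    return cursor.fetchone() is None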
def get_major(url_ext):
    """Scrape the list of major groups, industry groups and four-digit SIC codes
    from the OSHA website

    Major groups are the second broadest grouping of SIC codes provided by OSHA
    Industry groups are the third broadest (more granular) grouping of SIC codes
    provided by OSHA
    """
    # Read site
    soup = get_soup(config.OSHA_base_url + url_ext)

    # Isolate relevant content
    container = soup.select('div#maincontain')[0]
    groups = container.find_all(['strong', 'li'])
    major_desc = str(container.find_all('h2')[0].contents[0])

    # Store cleaned descriptions from strong and li elements
    majors = []
    for i in range(len(groups)):
        g = groups[i]

        # Get descriptions of industry groups and four-digit SIC codes
        if g.name == 'strong':
            # Industry group descriptions
            full_desc = g.contents[0].strip().encode("utf-8")
            link = None
        elif g.name == 'li':
            # Four-digit SIC code descriptions
            full_desc = 'SIC4 ' + str(g.contents[0]).strip() + \
                        ': ' + str(g.contents[1].contents[0]).strip()
            link = g.contents[1].get('href').encode("utf-8")
        else:
            # Otherwise raise a value error
            raise ValueError('Unexpected element type: ' + g.name)

        # Get the description of the parent group
        if i > 0 and clean_desc(full_desc)[1] == 'SIC4':
            parent_desc = get_parent(majors, i, 'SIC4', 'Industry Group')
        else:
            parent_desc = major_desc

        # Add to running list of named tuples
        majors.append(ind_group(full_desc, parent_desc, link))

    return majors
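# A hypothetical driver that chains the two OSHA scrapers: follow each
# major-group link collected by get_divisions() and pull its industry groups
# and SIC4 codes with get_major(). The field names reuse the assumed ind_group
# sketch above; the filtering on 'Major Group' is an assumption about how the
# two functions are meant to be combined.
def scrape_all_sic_groups():
    divisions = get_divisions()
    all_groups = list(divisions)
    for d in divisions:
        link = d.link.decode('utf-8') if isinstance(d.link, bytes) else d.link
        # Only major-group rows point at a page of industry groups / SIC4 codes
        if link and clean_desc(d.full_desc)[1] == 'Major Group':
            all_groups.extend(get_major(link))
    return all_groups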
def get_youtube_info(url):
    # Work out which kind of YouTube URL this is
    if 'channel' in url:
        tag = 'channel'
    elif 'playlist' in url:
        tag = 'playlist'
    elif 'user' in url:
        tag = 'user'
    else:
        return {}

    # Playlist URLs carry the id as a query parameter; channel/user URLs as a path segment
    sep = '=' if tag == 'playlist' else '/'
    id_ = url.split(sep)[-1]

    # Build the matching RSS feed URL (channel_id=, playlist_id=, or user=)
    prefix = 'https://www.youtube.com/feeds/videos.xml?'
    postfix = '_id=' if tag != 'user' else '='
    xml = prefix + tag + postfix + id_

    # Fetch the feed to pick up the channel/playlist title
    soup = get_soup(xml)
    name = soup.find('title').text

    return {
        'name': name,
        'tag': tag,
        'rss': xml
    }
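# Hypothetical usage (the channel id below is a placeholder): a channel URL maps
# to the channel_id feed, a playlist URL to playlist_id, and a legacy user URL to user=.
def print_youtube_feed_info(url='https://www.youtube.com/channel/UCxxxxxxxxxxxxxxxxxxxxxx'):
    info = get_youtube_info(url)
    # e.g. https://www.youtube.com/feeds/videos.xml?channel_id=UCxxxxxxxxxxxxxxxxxxxxxx
    print(info['rss'])
    # the feed's <title>, i.e. the channel or playlist name
    print(info['name'])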
def get_sic_sec():
    """Scrape SIC codes from the SEC website
    """
    # Setup
    soup = get_soup(config.SEC_base_url)
    table = soup.find_all('table')[3]

    # Convert HTML to nested list
    data = []
    for row in table.find_all('tr'):
        cols = row.find_all('td')
        # Replace non-breaking spaces with regular spaces
        cols = [ele.text.strip().replace(u'\xa0', ' ') for ele in cols]
        if len(cols) > 1:
            data.append([ele.encode('utf-8') for ele in cols if ele])

    # Clean headers
    if data[0] != config.SEC_expected_columns:
        warnings.warn('Warning: column names have changed in URL ' +
                      config.SEC_base_url)
    data[0] = config.SEC_columns

    return data
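# A hypothetical way to persist the scraped SEC table; the filename and the
# bytes-to-text decode step are assumptions for illustration.
import csv

def write_sic_sec_csv(path='sec_sic_codes.csv'):
    """Dump the scraped SEC SIC table (header row first) to a CSV file."""
    data = get_sic_sec()
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        for row in data:
            # Cells are stored as UTF-8 bytes; decode before writing
            writer.writerow([c.decode('utf-8') if isinstance(c, bytes) else c
                             for c in row])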
import soup as s

url = "https://www.nytimes.com/"

if __name__ == "__main__":
    print(
        "Note:\n"
        "\tThe HTML is now quite irregular, "
        "so there is little point in checking whether this is 100% accurate.\n"
        "\tThe structure and tags will be different in a couple of years anyway.\n\n"
    )
    soup = s.get_soup(url)
    print(*s.get_all_tags(soup, 'span'), sep="\n")
    print(*s.get_all_tags(soup, 'h2'), sep="\n")
def __init__(self, url, last_date, item_name='item'):
    # Fetch and parse the source, and store the item tag name and the last date
    self.soup = get_soup(url)
    self.item = item_name
    self.last_date = last_date