def mini_crawler(url, page_parser_q, pull_info_q, links_visited,
                 limiting_domain, index_list, parsing_default_domain):
    '''
    Crawls the college catalog and adds to an index dictionary that maps a
    set of words to the associated course identifier.

    Inputs:
        url: starting url to begin crawling with
        page_parser_q: queue of urls in line to be crawled
        pull_info_q: queue of urls whose pages will be parsed for info
        links_visited: list of visited links
        limiting_domain: domain name
        index_list: dictionary that maps words to course identifiers
        parsing_default_domain: default domain used when resolving page links
    '''
    if url in links_visited:
        return
    request = util.get_request(url)
    if request is None:
        return
    post_url = util.get_request_url(request)
    if post_url in links_visited:
        return
    html = util.read_request(request)
    soup = bs4.BeautifulSoup(html, features="html5lib")
    find_links(soup, url, post_url, pull_info_q, links_visited,
               limiting_domain)
    # Follow the "pagination" list to queue the next results page.
    tag_list = soup.find_all("ul", attrs={"class": "pagination"})
    current_page = tag_list[0].find_all("li", attrs={"class": "current"})
    next_page = current_page[0].next_sibling.next_sibling.findChild()
    next_page_href = next_page.get('href')
    next_page_href = util.convert_if_relative_url(post_url, next_page_href)
    page_parser_q.put(next_page_href)
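# Every crawler in this collection leans on a shared `util` module
# (get_request, read_request, get_request_url, convert_if_relative_url,
# remove_fragment, is_absolute_url, is_url_ok_to_follow). As a rough,
# hypothetical sketch -- not the original module -- the two most heavily
# used helpers could be thin wrappers over the requests library:

import requests

def get_request_sketch(url):
    """Return a response object for url, or None on any failure."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response
    except requests.RequestException:
        return None

def read_request_sketch(request):
    """Return the body of a response as text, or '' if unreadable."""
    if request is None:
        return ""
    return request.text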
def get_restaurant_links_chicago():
    # Start from searching "Restaurants", "Chicago" on the yelp main page;
    # results are paginated in steps of 10.
    page_suffix = [i for i in range(0, 231, 10)]
    url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start='
    url_list = []
    for suffix in page_suffix:
        page_url = url + str(suffix)
        url_list.append(page_url)
    links = []
    for url in url_list:
        request = util.get_request(url)
        text = util.read_request(request)
        soup = bs4.BeautifulSoup(text, "html5lib")
        tags = soup.find_all('a', href=True, target="", role="")
        # Extract href links to restaurants.
        for tag in tags:
            link = tag['href']
            link = util.convert_if_relative_url(url, link)
            link = util.remove_fragment(link)
            # Hardcoded filter
            if link[-11:] == "Restaurants":
                if tag["name"] != '':
                    if link not in links:
                        links.append(link)
    return links
def get_walk_score(zip_code):
    '''
    Gets the walk score for a single zip code.

    Input:
        zip_code (str or int): a US zip code

    Output:
        score (int): walk score for that zip code. Missing values get -1.
    '''
    url = "https://www.walkscore.com/score/" + str(zip_code)
    req = util.get_request(url)
    if req:
        text = util.read_request(req)
    else:
        score = -1
        text = None
    if text:
        soup = bs4.BeautifulSoup(text, features='lxml')
        span = soup.find('span', attrs={'id': 'score-description-sentence'})
        try:
            score_txt = span.text
            match = re.search(r"(Walk Score of)(\s)(\d+)(\s)", score_txt)
            score = int(match.group(3))
        except AttributeError:
            # span or match is None when the page has no score sentence.
            score = -1
    else:
        score = -1
    return score
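# A small usage sketch for get_walk_score; the zip codes are illustrative
# assumptions, and -1 marks a failed lookup per the function's contract.
sample_zips = ["60637", "60615", "60201"]
walk_scores = {z: get_walk_score(z) for z in sample_zips}
print(walk_scores)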
def queue_children_sites(starting_url, queue):
    '''Given a url and a queue, adds all children urls of the start point
    to the queue.

    Inputs:
        starting_url -- string that corresponds to a url
        queue -- queue.Queue object

    Outputs: None, queue is modified in place to contain all child urls'''
    # Turn http into https if it is not already.
    if starting_url[4] != 's':
        starting_url = starting_url[:4] + 's' + starting_url[4:]
    request = util.get_request(starting_url)
    assert request is not None
    text = util.read_request(request)
    soup = bs4.BeautifulSoup(text, "html5lib")
    URLs = soup.find_all("a")
    URLs = [URL["href"] for URL in URLs if URL.has_attr("href")]
    children = []
    for URL in URLs:
        if util.is_absolute_url(URL):
            children.append(URL)
        else:
            URL = util.convert_if_relative_url(starting_url, URL)
            children.append(URL)
    # limiting_domain is a module-level global in the original project.
    children = [
        child for child in children
        if util.is_url_ok_to_follow(child, limiting_domain)
    ]
    for child in children:
        queue.put(child)
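# Usage sketch for queue_children_sites. The function reads a module-level
# `limiting_domain` global, so one must be defined first (the domain and
# url below are assumptions for illustration).
import queue

limiting_domain = "example.com"
child_queue = queue.Queue()
queue_children_sites("http://example.com/", child_queue)
while not child_queue.empty():
    print(child_queue.get())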
def get_cities():
    city_state = get_loc_cook()
    cook_cities = []
    for ele in city_state:
        cook_cities.append(ele[0])
    url = 'https://www.rentcafe.com/sitemaps/us/il/average-rent-market-trends/'
    request = util.get_request(url)
    text = util.read_request(request)
    soup = bs4.BeautifulSoup(text, "html5lib")
    tags = soup.find_all('a', href=True, target="", role="")
    cities = []
    for tag in tags:
        if "title" in tag.attrs:
            city = tag['title']
            # Link titles look like "Average Rent in <city>".
            if city[0:15] == "Average Rent in":
                city = city[16:]
                if city in cook_cities:
                    cities.append(city)
    return cities
def mini_crawler(url, page_parser_q, pull_info_q, links_visited,
                 limiting_domain, index_list):
    '''
    Crawls the college catalog and adds to an index list that maps a set of
    words to the associated course identifier.

    Inputs:
        url: starting url to begin crawling with
        page_parser_q: queue of urls in line to be crawled
        pull_info_q: queue of urls whose pages will be parsed for info
        links_visited: list of visited links
        limiting_domain: domain name
        index_list: list of dictionaries for each webpage
    '''
    if url in links_visited:
        return
    request = util.get_request(url)
    if request is None:
        return
    post_url = util.get_request_url(request)
    if post_url in links_visited:
        return
    html = util.read_request(request)
    soup = bs4.BeautifulSoup(html, features="html5lib")
    find_links(soup, url, post_url, pull_info_q, links_visited,
               limiting_domain)
    # Queue every page in the "Pagination" list, skipping the first entry
    # (the current page).
    tag_list = soup.find_all("ul", attrs={"class": "Pagination"})
    pages = tag_list[0].find_all("li")
    pages = pages[1:]
    for page in pages:
        page_parser_q.put(page.findChild().get('href'))
def crawler():
    starting_url = "https://www.teenlife.com/search?q=&l=&c=Summer%20Program&p=1"
    limiting_domain = "www.teenlife.com"
    parsing_default_domain = "https://www.teenlife.com/search"
    numpages = 0
    links_visited = []
    index_list = []
    page_parser_q = queue.Queue()
    pull_info_q = queue.Queue()
    page_parser_q.put(starting_url)
    while not page_parser_q.empty():
        link = page_parser_q.get()
        mini_crawler(link, page_parser_q, pull_info_q, links_visited,
                     limiting_domain, index_list, parsing_default_domain)
        numpages += 1
        print(link, "link")
    while not pull_info_q.empty():
        page_link = pull_info_q.get()
        print(page_link, "page_link")
        request = util.get_request(page_link)
        if request is not None:
            html = util.read_request(request)
            soup = bs4.BeautifulSoup(html, features="html5lib")
            make_index(soup, index_list)
    df = pd.DataFrame(index_list)
    return df
def crawler():
    starting_url = "https://rusticpathways.com/students/programs?_=1584132668586&page=1"
    limiting_domain = "rusticpathways.com"
    numpages = 0
    links_visited = []
    index_list = []
    page_parser_q = queue.Queue()
    pull_info_q = queue.Queue()
    page_parser_q.put(starting_url)
    while not page_parser_q.empty():
        link = page_parser_q.get()
        mini_crawler(link, page_parser_q, pull_info_q, links_visited,
                     limiting_domain, index_list)
        numpages += 1
    while not pull_info_q.empty():
        page_link = pull_info_q.get()
        request = util.get_request(page_link)
        if request is not None:
            html = util.read_request(request)
            soup = bs4.BeautifulSoup(html, features="html5lib")
            make_index(soup, index_list, page_link)
    df = pd.DataFrame(index_list)
    return df
def go(housing_links):
    '''
    Main function.

    Inputs:
        housing_links (list): a list of links obtained from inputting
            different zipcodes into the search bar of rentcafe.com

    Output:
        d (dict): a dictionary mapping each zipcode to a tuple
            (mean_price, income)
    '''
    # A dictionary with zipcodes as keys and average rent prices as values.
    d = {}
    for link in housing_links:
        zip_code = str(link[-5:])
        d[zip_code] = []
        request = util.get_request(link)
        text = util.read_request(request)
        soup = bs4.BeautifulSoup(text, "html5lib")
        # Find the median income for this zipcode.
        li_tags = soup.find_all('li', class_="medium")
        income = np.int64(re.findall(r'\d+(?:,\d+)?',
                                     li_tags[2].text)[0].replace(',', ''))
        # Collect all subpages under this zipcode.
        pages_to_crawl = []
        tags = soup.find('ul', class_="pagination")
        if tags is None:
            pages_to_crawl = [link]
        else:
            pages = tags.find_all('a', href=True)
            for a in pages:
                if a['href'] not in pages_to_crawl:
                    pages_to_crawl.append(a['href'])
        for url in pages_to_crawl:
            request = util.get_request(url)
            text = util.read_request(request)
            soup = bs4.BeautifulSoup(text, "html5lib")
            property_tags = soup.find_all('div', class_='item-information')
            for item in property_tags:
                d[zip_code].append(find_adj_price(item))
        d[zip_code] = (np.mean([x for x in d[zip_code] if x != 0]), income)
    return d
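# Usage sketch for go(): the function relies on each link ending in a
# 5-digit zip code and on the module's find_adj_price helper, so the
# rentcafe url below is a hypothetical example.
sample_links = [
    "https://www.rentcafe.com/apartments-for-rent/us/il/chicago/60615",
]
rent_by_zip = go(sample_links)
print(rent_by_zip)  # {"60615": (mean_adjusted_price, median_income)}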
def create_dictionary(num_pages_to_crawl, course_map_filename, starting_url,
                      limiting_domain):
    '''
    Creates the dictionary mapping course id numbers to the words in the
    course titles and descriptions.

    Inputs:
        num_pages_to_crawl: (int) The number of pages to process during the
            crawl.
        course_map_filename: (string) The name of the JSON file that
            contains the mapping of course codes to course ids.
        starting_url: (string) The url of the first page that the crawler
            visits.
        limiting_domain: (string) The limiting domain of the url.

    Outputs:
        The dictionary mapping course id numbers to the words in the course
        titles and descriptions.
    '''
    with open(course_map_filename) as json_file:
        coursemap = json.load(json_file)
    url_list = []
    url_queue = queue.Queue()
    num_pages = 0
    course_dict = {}
    process_dict = {}
    starting_url = clean_url(starting_url, limiting_domain, parent_url=None)
    if starting_url:
        url_queue.put(starting_url)
    while num_pages < num_pages_to_crawl and not url_queue.empty():
        num_pages += 1
        next_url = url_queue.get()
        if next_url and next_url not in url_list:
            request = util.get_request(next_url)
            if request:
                request_url = util.get_request_url(request)
                if request_url and request_url not in url_list:
                    url_list.append(next_url)
                    if request_url not in url_list:
                        url_list.append(request_url)
                    html_text = util.read_request(request)
                    soup = bs4.BeautifulSoup(html_text, "html5lib")
                    process_dict.update(find_course_info(soup, coursemap,
                                                         course_dict))
                    if process_dict:
                        course_dict.update(process_dict)
                    href_list = soup.find_all("a", href=True)
                    for h in href_list:
                        h_url = h['href']
                        h_url = clean_url(h_url, limiting_domain, request_url)
                        if h_url:
                            url_queue.put(h_url)
    return course_dict
def open_page(url):
    r = util.get_request(url)
    if r:
        # Pass an explicit parser to avoid bs4's "no parser specified" warning.
        return r.url, BeautifulSoup(util.read_request(r), "html5lib")
    else:
        return None, None
def build_search_engine(starting_url, limiting_domain,
                        max_num_pages_to_visit):
    urls = queue.Queue()
    visited = []
    index = {}

    def search(word):
        rv = []
        matches = []
        words = re.findall(r"[a-zA-Z]\w*", word)
        if len(words) == 0:
            return []
        for url in index.keys():
            for title in index[url].keys():
                for word in words:
                    word = word.lower()
                    if word in title or word in index[url][title]:
                        matches.append((title, url))
        # Keep only pages that matched every query word.
        for pair in matches:
            if matches.count(pair) == len(words):
                if pair not in rv:
                    rv.append(pair)
        return rv

    if util.is_url_ok_to_follow(starting_url, limiting_domain):
        urls.put(starting_url)
        while not urls.empty() and len(visited) < max_num_pages_to_visit:
            top_queue = urls.get()
            if top_queue not in visited and util.is_url_ok_to_follow(
                    top_queue, limiting_domain):
                request = util.get_request(top_queue)
                if request is None:
                    visited.append(top_queue)
                    continue
                new_page = util.get_request_url(request)
                if new_page != top_queue:
                    if new_page not in visited:
                        visited.append(new_page)
                        top_queue = new_page
                data = bs4.BeautifulSoup(util.read_request(request),
                                         "html5lib")
                visited.append(top_queue)
                index = indexer(index, top_queue, data)
                for link in data.find_all('a'):
                    href = link.get('href')
                    if href is None:
                        continue
                    href = util.remove_fragment(href)
                    if not util.is_absolute_url(href):
                        href = util.convert_if_relative_url(top_queue, href)
                    urls.put(href)
    else:
        return None
    return search
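# build_search_engine returns a closure over the index it builds, so a
# typical call looks like the sketch below (the url and domain are borrowed
# from the catalog crawler later in this file; the query is an illustrative
# assumption):
search = build_search_engine(
    "http://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/"
    "new.collegecatalog.uchicago.edu/index.html",
    "classes.cs.uchicago.edu",
    100)
if search is not None:
    print(search("linear algebra"))  # (title, url) pairs matching every word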
def get_soup_from_url(url):
    '''
    Input:
        url - absolute url

    Returns: BeautifulSoup object corresponding to url
    '''
    request = util.get_request(url)
    if request is None:
        return None
    text = util.read_request(request)
    soup = bs4.BeautifulSoup(text, "html5lib")
    return soup
def get_restaurant_links_cook():
    cities = get_cities()
    city_state = get_loc_cook()
    new_city_state = []
    for ele in city_state:
        if ele[0] in cities:
            new_city_state.append(ele)
    page_suffix = [i for i in range(0, 231, 10)]
    url_list = []
    for city, state in city_state:
        html = ("https://www.yelp.com/search?find_desc=Restaurants&find_loc="
                + city.replace(" ", "") + "%2C%20" + state)
        for suffix in page_suffix:
            html_page = html + "&start=" + str(suffix)
            url_list.append(html_page)
    # with open(r"c:\Users\35653\Desktop\CS122\project\urls.txt", "w") as write_file:
    #     write_file.writelines(url_list)
    # Override: restrict the crawl to a single results page.
    url_list = [
        "https://www.yelp.com/search?find_desc=Restaurants&find_loc=Lyons%2C%20IL&start=190"
    ]
    links = []
    for url in url_list:
        request = util.get_request(url)
        if request:
            text = util.read_request(request)
            soup = bs4.BeautifulSoup(text, "html5lib")
            tags = soup.find_all('a', href=True, target="", role="")
            # Extract href links to restaurants.
            for tag in tags:
                link = tag['href']
                link = util.convert_if_relative_url(url, link)
                link = util.remove_fragment(link)
                # Hardcoded filter
                if link[-11:] == "Restaurants":
                    if tag["name"] != '':
                        if link not in links:
                            links.append(link + "\n")
    return links
def creat_model_data():
    request_path = r'data\request.txt'
    vertex_path = r'data\vertex.txt'
    vehicle_path = r'data\vehicle.txt'
    requests = util.read_request(request_path)
    vertexs = util.read_vertex(vertex_path)
    vehicles = util.read_vehicle(vehicle_path)
    time_matrix = util.cal_time(vertexs)
    distance_matrix = time_matrix * 2
    # Build an (i, j) -> travel-time mapping from the matrix.
    tim = {}
    for i in range(time_matrix.shape[0]):
        for j in range(time_matrix.shape[1]):
            tim[i, j] = time_matrix[i][j]
    print(tim)
    print()
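# The nested loops above can be written more compactly with numpy's
# ndenumerate; a behavior-equivalent sketch assuming time_matrix is a 2-D
# ndarray:
import numpy as np

def time_dict_from_matrix(time_matrix):
    return {(i, j): t for (i, j), t in np.ndenumerate(time_matrix)}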
def make_soup(url):
    '''
    Makes a soup object from an html request object.

    Inputs:
        url: the url to request

    Outputs:
        soup - Soup object, if the request is valid; otherwise None.
    '''
    req = util.get_request(url)
    if req is None:
        return None
    html = util.read_request(req)
    if html is not None and html != "":
        soup = bs4.BeautifulSoup(html, "html5lib")
        return soup
    return None
def get_soup_object(url):
    """
    Takes a url, checks for possible redirection, returns soup object.

    Inputs:
        url (string)

    Returns: Soup object, or None if the request fails.
    """
    request = util.get_request(url)
    if request is None:
        return None
    html_text = util.read_request(request)
    soup = bs4.BeautifulSoup(html_text, 'html5lib')
    return soup
def get_soup(url):
    '''
    Returns the soup of the current_market_url.

    Inputs:
        url: str

    Output: BeautifulSoup object
    '''
    time.sleep(0.05)
    url_request = util.get_request(url)
    if not url_request:
        return False
    html = util.read_request(url_request)
    if not html:
        return False
    return bs4.BeautifulSoup(html, "html5lib")
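# get_soup returns False when the request or read fails, which makes it
# easy to wrap with a retry policy. A sketch with an assumed policy of
# three attempts and exponential backoff:
def get_soup_with_retry(url, attempts=3, base_delay=1.0):
    for attempt in range(attempts):
        soup = get_soup(url)
        if soup:
            return soup
        time.sleep(base_delay * (2 ** attempt))  # back off before retrying
    return False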
def get_restaurant_links():
    '''
    Starting from a search for "Restaurants" in "Chicago" on the yelp main
    page, collects all restaurant links from the 24 result pages.

    Input: None

    Output:
        links (list): a list of links
    '''
    page_suffix = [i for i in range(0, 231, 10)]
    url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start='
    url_list = []
    for suffix in page_suffix:
        page_url = url + str(suffix)
        url_list.append(page_url)
    links = []
    count = 0
    for url in url_list:
        count += 1
        print(count)
        request = util.get_request(url)
        text = util.read_request(request)
        soup = bs4.BeautifulSoup(text, "html5lib")
        tags = soup.find_all('a', href=True, target="", role="")
        for tag in tags:
            link = tag['href']
            link = util.convert_if_relative_url(url, link)
            link = util.remove_fragment(link)
            if link[-11:] == "Restaurants":
                if tag["name"] != '':
                    if link not in links:
                        links.append(link + "\n")
                        print(link)
        # Randomized delay of 5 to 10 seconds between pages to stay polite.
        i = 5 + random.random() * 5
        time.sleep(i)
    return links
def get_referees(url):
    base_url = 'http://www.basketball-reference.com/boxscores/'
    comp_url = base_url + url + '.html'
    request = util.get_request(comp_url)
    if request is not None:
        html = util.read_request(request)
        if html is not None:
            soup = bs4.BeautifulSoup(html, "html5lib")
            div_tags = soup.find_all('div')
            good_tags = str(div_tags)
            # Pull the names between "Officials:" and the next <br> tag.
            string = re.findall(r'(?<=Officials:)(.*?)(?=\<br)', good_tags)
            rv = re.findall(r'(?<=.html\"\>)(.*?)(?=<\/a)', string[0])
            return rv
def get_paper_links(number_of_articles, fields):
    '''
    Crawls through Nature search pages and pulls article links from
    different fields.

    Input:
        number_of_articles: int, number of articles to find
        fields: list of field str, all major fields in nature.com/subjects

    Output:
        paper_links: list of paper urls (str)
    '''
    search_url = ('https://www.nature.com/search?article_type=protocols'
                  '%2Cresearch%2Creviews&subject=')
    suffix = '&page='
    search_urls = []
    paper_links = []
    num_articles_per_field = number_of_articles // 8
    num_pages_to_visit = int(np.ceil(num_articles_per_field / 50))
    num_on_last_page = num_articles_per_field % 50
    for field in fields:
        for i in range(num_pages_to_visit):
            new_url = search_url + field + suffix + str(i + 1)
            search_urls.append(new_url)
    for url in search_urls:
        num_to_search = 50
        # Only the last page of each field is read partially; comparing the
        # final character assumes fewer than 10 pages per field.
        if int(url[-1]) == num_pages_to_visit:
            num_to_search = num_on_last_page
        new_request = util.get_request(url)
        html = util.read_request(new_request)
        search_soup = bs4.BeautifulSoup(html, features='html.parser')
        article_links = search_soup.find_all(
            'h2', class_='h3 extra-tight-line-height', itemprop='headline')
        article_links = article_links[:num_to_search]
        paper_links.extend([i.find('a')['href'] for i in article_links])
    return paper_links
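# Usage sketch for get_paper_links, splitting roughly 80 articles evenly
# across the eight nature.com subject fields used by nature_crawler below:
fields = ['biological-sciences', 'business-and-commerce',
          'earth-and-environmental-sciences', 'health-sciences', 'humanities',
          'physical-sciences', 'scientific-community-and-society',
          'social-science']
links = get_paper_links(80, fields)  # relative urls, e.g. "/articles/..."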
def analyze_page(url, queue, limiting_domain, course_dict):
    '''
    Queues all urls, then makes a dictionary index of the course codes to a
    list of words in the course description.

    Inputs:
        url: the url of the page to analyze
        queue: the queue that holds the urls
        limiting_domain: a domain to stay within when queuing
        course_dict: the index dictionary

    Outputs:
        None
    '''
    request = util.get_request(url)
    text = util.read_request(request)
    soup = bs4.BeautifulSoup(text, "html5lib")
    queue_urls(url, soup, queue, limiting_domain)
    find_courses(soup, course_dict)
def translate_general(code):
    '''
    Given an ICD code, scrapes www.icd10data.com and returns a meaningful
    translation as a string.

    Input:
        code (int): an ICD code

    Output:
        rv (string): translation of the ICD code
    '''
    url = BASE_GEN + str(code) + '&codebook=icd9volume1'
    ro = util.get_request(url)
    html = util.read_request(ro)
    soup = bs4.BeautifulSoup(html, "html5lib")
    rv = None
    search = soup.find('div').next_sibling.next_sibling.find(
        'div', class_='searchPadded')
    if search and search.text:
        rv = search.text
    return rv
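# Usage sketch for translate_general. BASE_GEN is a module-level search-url
# prefix defined elsewhere in the original project, and the code value here
# is an illustrative assumption:
description = translate_general(250)
print(description)  # a description string, or None if no match was found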
def get_neighbors(node):
    print(" completed")
    neighbors = []
    soup = None
    response = util.get_request(node)
    if response is None:
        print('No response')
        neighbors = None
    else:
        text = util.read_request(response)
        if text == "":
            print("No text read")
            neighbors = None
        else:
            soup = bs4.BeautifulSoup(text, "html5lib")
            for link in soup.find_all("a"):
                url_raw = link.get("href")
                url_rel = util.remove_fragment(url_raw)
                url = util.convert_if_relative_url(node, url_rel)
                print(url)
                if url is not None:
                    neighbors.append(url)
    return neighbors, response, soup
def mini_crawler(url, page_parser_q, pull_info_q, links_visited,
                 limiting_domain, index_list):
    '''
    Crawls the college catalog and adds to an index list that maps a set of
    words to the associated course identifier.

    Inputs:
        url: starting url to begin crawling with
        page_parser_q: queue of urls in line to be crawled
        pull_info_q: queue of urls whose pages will be parsed for info
        links_visited: list of visited links
        limiting_domain: domain name
        index_list: list of dictionaries for individual programs
    '''
    if url in links_visited:
        return
    request = util.get_request(url)
    if request is None:
        return
    post_url = util.get_request_url(request)
    if post_url in links_visited:
        return
    html = util.read_request(request)
    soup = bs4.BeautifulSoup(html, features="html5lib")
    find_links(soup, url, post_url, page_parser_q, pull_info_q,
               links_visited, limiting_domain)
def scrape_chart_data(html):
    '''
    Takes the url of a neighborhood page on Trulia and extracts the data
    from its graph of median sale pricing using BeautifulSoup.

    Inputs:
        html - the url of the page holding the chart

    Returns:
        data_points (list of dicts)
    '''
    request = util.get_request(html)
    doc = util.read_request(request)
    soup = bs4.BeautifulSoup(doc, 'lxml')
    scripts = soup.find_all('script')
    chart = None
    for script in scripts:
        if 'var CHART_DATA' in script.text:
            chart = script.text
            break
    # Slice the median-sales series out of the inline chart JavaScript.
    start = 'medianSalesPoints: '
    end = 'salesVolumePoints'
    chart_list = (chart.split(start))[1].split(end)[0]
    chart_list = chart_list.replace('\n', '')
    chart_list = chart_list.replace('null', 'None')
    while chart_list[-1] == ' ':
        chart_list = chart_list[:-1]
    if chart_list[-1] == ',':
        chart_list = chart_list[:-1]
    data_by_bed_num = eval(chart_list)
    data_points = data_by_bed_num[-1]['points']
    return data_points
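# Since the chart payload is turned into a Python literal ('null' ->
# 'None') before being evaluated, ast.literal_eval can replace eval() and
# refuse anything but plain literals. A sketch:
import ast

def parse_chart_list(chart_list):
    return ast.literal_eval(chart_list)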
def go(num_pages_to_crawl, course_map_filename, index_filename):
    '''
    Crawls the college catalog and generates a CSV file with an index.

    Inputs:
        num_pages_to_crawl: the number of pages to process during the crawl
        course_map_filename: the name of a JSON file that contains the
            mapping of course codes to course identifiers
        index_filename: the name for the CSV of the index.

    Outputs:
        CSV file of the index.
    '''
    starting_url = ("http://www.classes.cs.uchicago.edu/archive/2015/winter"
                    "/12200-1/new.collegecatalog.uchicago.edu/index.html")
    limiting_domain = "classes.cs.uchicago.edu"
    index = {}
    unique_list = [starting_url]
    url_queue = queue.Queue()
    url_queue.put(starting_url)
    counter = 0
    with open(course_map_filename) as data:
        course_dict = json.load(data)
    while counter < num_pages_to_crawl and not url_queue.empty():
        next_url = url_queue.get()
        request_obj = util.get_request(next_url)
        url_text = util.read_request(request_obj)
        soup = bs4.BeautifulSoup(url_text, "html5lib")
        next_tag_list = find_tag_list(next_url, soup, request_obj,
                                      limiting_domain)
        create_index(soup, course_dict, index)
        add_to_queue(next_tag_list, url_queue, unique_list)
        counter += 1
    create_csv(index, index_filename)
def nature_crawler(number_of_articles, database_name):
    '''
    Crawls nature.com and their suite of journals to extract authorship
    information for the database 'journals.db'.

    Inputs:
        number_of_articles: int, the number of articles to extract from the
            journals

    Outputs:
        None, but modifies journals.db
    '''
    conn = sqlite3.connect(database_name)
    c = conn.cursor()
    home_domain = "https://www.nature.com"
    search_urls = []
    fields = ['biological-sciences', 'business-and-commerce',
              'earth-and-environmental-sciences', 'health-sciences',
              'humanities', 'physical-sciences',
              'scientific-community-and-society', 'social-science']
    paper_links = get_paper_links(number_of_articles, fields)
    for i, link in enumerate(paper_links):
        try:
            new_request = util.get_request(home_domain + link)
            html = util.read_request(new_request)
            article_soup = bs4.BeautifulSoup(html, features='html.parser')
            authors = article_soup.find_all('meta',
                                            {'name': 'citation_author'})
            # Process the paper.
            full_title = article_soup.find('title').text.split(' | ')
            paper_title = full_title[0]
            print(paper_title)
            journal = full_title[1]
            num_authors = len(authors)
            year_find = article_soup.find('meta',
                                          {'name': 'dc.date'})['content']
            year = year_find.split('-')[0]
            num_articles_per_field = number_of_articles // 8
            field_index = int(np.ceil(i // num_articles_per_field))
            field = fields[field_index]
            insert = (paper_title, year, journal, field, num_authors)
        except:
            print('paper extraction failed')
            continue
        try:
            c.execute('INSERT INTO papers(title, year, journal, field, '
                      'num_authors) VALUES (?, ?, ?, ?, ?)', insert)
            conn.commit()
        except:
            print('error, insert already in database')
            continue
        fetch = c.execute('SELECT paper_identifier FROM papers WHERE '
                          'title = ?', (paper_title,))
        paper_identifier = fetch.fetchone()[0]
        # Process each author.
        for rank, author in enumerate(authors):
            try:
                name = author['content'].split()
                last_name = name.pop()
                first_name = ' '.join(name)
                gen = gender.get_gender(name[0].strip())
                institution, country = get_institution_name(author, authors)
                insert = (first_name, last_name, institution, gen, country)
            except:
                print('unable to extract')
                continue
            try:
                c.execute('INSERT INTO authors(first_name, last_name, '
                          'institution, gender, country) VALUES '
                          '(?, ?, ?, ?, ?)', insert)
                conn.commit()
            except:
                print('author already here')
            fetch = c.execute('SELECT author_identifier FROM authors WHERE '
                              'first_name = ? AND last_name = ?',
                              (first_name, last_name))
            author_identifier = fetch.fetchone()[0]
            insert = (author_identifier, paper_identifier, rank + 1)
            c.execute('INSERT INTO author_key_rank(author_identifier, '
                      'paper_identifier, rank) VALUES (?, ?, ?)', insert)
            conn.commit()
        print(i)
def get_info(links):
    '''
    Extracts restaurant name, zipcode, price, cuisine, number of reviews,
    and average rating.

    Input:
        links (list): a list of links returned by link_crawler.py

    Output:
        info (list): a list of restaurant info dictionaries
    '''
    info = []
    for url in links:
        print("start at", url)
        restaurant = {}
        request = util.get_request(url)
        if request is not None:
            text = util.read_request(request)
            soup = bs4.BeautifulSoup(text, "html5lib")
            tag = soup.find('script', type='application/ld+json')
            if tag is not None:
                d = json.loads(tag.text)
                restaurant['restaurant_name'] = d['name'].\
                    replace('&#39;', "'").replace('&amp;', '&')
                restaurant['zip_code'] = d['address']['postalCode']
                if 'priceRange' in d:
                    if d['priceRange'] == "Above $61":
                        restaurant['price'] = 70
                    elif d['priceRange'] == "Under $10":
                        restaurant['price'] = 5
                    else:
                        price = re.findall(r'[0-9]+', d['priceRange'])
                        # Average of the price upper and lower bounds.
                        restaurant['price'] = (float(price[1]) +
                                               float(price[0])) / 2
                else:
                    restaurant['price'] = None
                restaurant['cuisine'] = d['servesCuisine'].replace('&amp;',
                                                                   '&')
                restaurant['num_review'] = len(d["review"])
                reviews = d["review"]
                if len(d['review']) != 0:
                    rating = 0
                    for review in reviews:
                        rating += review['reviewRating']['ratingValue']
                    restaurant['rating'] = rating / len(d["review"])
                else:
                    restaurant['rating'] = 0
                info.append(restaurant)
            else:
                print("the link gives an empty tag")
        else:
            print("empty request", url)
    return info
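# The manual '&#39;' and '&amp;' replacements in get_info can be
# generalized with the standard library: html.unescape handles all named
# and numeric entities in one call.
import html

print(html.unescape("Giordano&#39;s Pizza &amp; Pasta"))  # Giordano's Pizza & Pasta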
def identify_text(url, filename):
    '''This is a very long function, with a lot of items in it, so get
    ready for this explanation. This function takes a url and a filename
    where a course-to-course-code dictionary can be found, and returns a
    dictionary whose entries are of the form:
        {course code: course title + course description}

    Inputs:
        url -- a string of the url to look for classes on
        filename -- a file containing the course code dictionary

    Outputs:
        coursetext -- a dictionary of the aforementioned form.'''
    with open(filename, 'r') as fp:
        course_dict = json.load(fp)
    # course_dict maps "Department Code <course number>" to course codes.
    request = util.get_request(url)
    assert request is not None
    text = util.read_request(request)
    soup = bs4.BeautifulSoup(text, "html5lib")
    divs = soup.find_all("div", class_="courseblock main")
    coursetext = {}
    for div in divs:
        desc = []
        desc += div.find_all("p", class_="courseblocktitle")
        desc += div.find_all("p", class_="courseblockdesc")
        courses = util.find_sequence(div)
        if courses != []:
            # Courses that are part of a sequence fall here.
            desc.append([])
            desc.append([])
            for div in courses:
                desc[2] += div.find_all("p", class_="courseblocktitle")
                desc[3] += div.find_all("p", class_="courseblockdesc")
            n = 0
            # On the sequence path, desc[2] has the html for every course
            # in the sequence.
            for course in desc[2]:
                course_name = course.text
                course_name = str(course_name).split()
                course_title = course_name[2:]
                # course_name becomes "Department Code <course number>".
                course_name = course_name[0] + ' ' + course_name[1][0:5]
                course_code = course_dict[course_name]
                course_text = desc[3][n].text
                # main_title and main_desc are the sequence title and
                # description, as opposed to the course's own.
                main_title = desc[0].text
                main_desc = desc[1].text
                course_text = course_title + str(course_text).split()
                course_text = (course_text + str(main_desc).split()
                               + str(main_title).split())
                # The split/join pair keeps the text as one big lowercase
                # block instead of splitting on every letter; course_text
                # is sequence title + desc + course title + desc.
                coursetext[course_code] = (' '.join(course_text)).lower()
                n += 1
        else:
            # Courses not in sequences fall here; the procedure is similar
            # to the sequence path above, but NOT identical.
            course_name = desc[0].text
            course_name = str(course_name).split()
            course_title = course_name[2:]
            course_name = course_name[0] + ' ' + course_name[1][0:5]
            course_code = course_dict[course_name]
            course_text = desc[1].text
            course_text = course_title + str(course_text).split()
            coursetext[course_code] = (' '.join(course_text)).lower()
    return coursetext