def mini_crawler(url, page_parser_q, pull_info_q, links_visited,
                 limiting_domain, index_list, parsing_default_domain):
    '''
    Crawls the college catalog and adds to an index dictionary that maps
    sets of words to their associated course identifiers.

    Inputs:
        url: starting url to begin crawling with
        page_parser_q: queue of urls in line to be crawled
        pull_info_q: queue of urls whose course information will be pulled
        links_visited: list of visited links
        limiting_domain: domain name
        index_list: dictionary that maps words to course identifiers
        parsing_default_domain: default domain used when converting urls
    '''
    if url in links_visited:
        return
    request = util.get_request(url)
    if request is None:
        return
    post_url = util.get_request_url(request)
    if post_url in links_visited:
        return
    html = util.read_request(request)
    soup = bs4.BeautifulSoup(html, features="html5lib")
    find_links(soup, url, post_url, pull_info_q, links_visited, limiting_domain)

    # Follow the "next" entry in the pagination bar and queue it for crawling.
    tag_list = soup.find_all("ul", attrs={"class": "pagination"})
    current_page = tag_list[0].find_all("li", attrs={"class": "current"})
    next_page = current_page[0].next_sibling.next_sibling.findChild()
    next_page_href = next_page.get('href')
    next_page_href = util.convert_if_relative_url(post_url, next_page_href)
    page_parser_q.put(next_page_href)
def mini_crawler(url, page_parser_q, pull_info_q, links_visited,
                 limiting_domain, index_list):
    '''
    Crawls the college catalog and adds to an index list that maps sets of
    words to their associated course identifiers.

    Inputs:
        url: starting url to begin crawling with
        page_parser_q: queue of urls in line to be crawled
        pull_info_q: queue of urls whose course information will be pulled
        links_visited: list of visited links
        limiting_domain: domain name
        index_list: list of dictionaries, one for each webpage
    '''
    if url in links_visited:
        return
    request = util.get_request(url)
    if request is None:
        return
    post_url = util.get_request_url(request)
    if post_url in links_visited:
        return
    html = util.read_request(request)
    soup = bs4.BeautifulSoup(html, features="html5lib")
    find_links(soup, url, post_url, pull_info_q, links_visited, limiting_domain)

    # Queue every page listed in the pagination bar (skipping the first entry).
    tag_list = soup.find_all("ul", attrs={"class": "Pagination"})
    pages = tag_list[0].find_all("li")
    pages = pages[1:]
    for page in pages:
        page_parser_q.put(page.findChild().get('href'))
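# Example driver for the pagination-list mini_crawler directly above. This is
# only a minimal sketch under assumptions: the `util` helper module from the
# assignment is importable, and the starting url / limiting domain passed in
# are placeholders rather than values taken from the original code.
import queue

def run_mini_crawler(starting_url, limiting_domain, max_pages=50):
    page_parser_q = queue.Queue()   # pages still to be parsed
    pull_info_q = queue.Queue()     # pages queued for detailed extraction
    links_visited = []              # shared list of visited links
    index_list = []                 # accumulates per-page index dictionaries

    page_parser_q.put(starting_url)
    pages_seen = 0
    while not page_parser_q.empty() and pages_seen < max_pages:
        next_url = page_parser_q.get()
        mini_crawler(next_url, page_parser_q, pull_info_q, links_visited,
                     limiting_domain, index_list)
        links_visited.append(next_url)
        pages_seen += 1
    return index_list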
def find_tag_list(url, soup_object, request_obj, limiting_d):
    '''
    Finds the list of hyperlink tags from the given URL and collects the
    list of URLs.

    Inputs:
        url (string): URL to find list of further URLs from
        soup_object (soup object): Soup object obtained from using
            Beautiful Soup on the given URL
        request_obj (request object): Request object obtained from the
            functions given in the PA to obtain a request object from a URL
        limiting_d (string): Limiting domain for the URLs

    Outputs:
        url_list (list): List containing all URLs that are able to be
            followed from the given URL.
    '''
    https_url = util.get_request_url(request_obj)
    ahref_tag_list = soup_object.find_all('a', href=True)
    url_list = []
    for tag in ahref_tag_list:
        this_url = tag['href']
        newest_url = util.convert_if_relative_url(https_url, this_url)
        if util.is_url_ok_to_follow(newest_url, limiting_d):
            url_list.append(newest_url)
    return url_list
def create_dictionary(num_pages_to_crawl, course_map_filename,
                      starting_url, limiting_domain):
    '''
    Creates the dictionary mapping course id numbers to the words in the
    course titles and descriptions.

    Inputs:
        num_pages_to_crawl: (int) The number of pages to process during
            the crawl.
        course_map_filename: (string) The name of the JSON file that
            contains the mapping of course codes to course ids.
        starting_url: (string) The url of the first page that the crawler
            visits.
        limiting_domain: (string) The limiting domain of the url.

    Outputs:
        The dictionary mapping course id numbers to the words in the
        course titles and descriptions.
    '''
    with open(course_map_filename) as json_file:
        coursemap = json.load(json_file)

    url_list = []
    url_queue = queue.Queue()
    num_pages = 0
    course_dict = {}
    process_dict = {}

    starting_url = clean_url(starting_url, limiting_domain, parent_url=None)
    if starting_url:
        url_queue.put(starting_url)

    while num_pages < num_pages_to_crawl and not url_queue.empty():
        num_pages += 1
        next_url = url_queue.get()
        if next_url and next_url not in url_list:
            request = util.get_request(next_url)
            if request:
                request_url = util.get_request_url(request)
                if request_url and request_url not in url_list:
                    url_list.append(next_url)
                    if request_url not in url_list:
                        url_list.append(request_url)
                    html_text = util.read_request(request)
                    soup = bs4.BeautifulSoup(html_text, "html5lib")
                    process_dict.update(find_course_info(soup, coursemap,
                                                         course_dict))
                    if process_dict:
                        course_dict.update(process_dict)
                    href_list = soup.find_all("a", href=True)
                    for h in href_list:
                        h_url = h['href']
                        h_url = clean_url(h_url, limiting_domain, request_url)
                        if h_url:
                            url_queue.put(h_url)

    return course_dict
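# Possible call site for create_dictionary. The file name, url, and domain
# below are illustrative placeholders; the JSON file is assumed to map course
# codes to course ids, as the docstring above describes.
if __name__ == "__main__":
    index = create_dictionary(
        num_pages_to_crawl=1000,
        course_map_filename="course_map.json",           # assumed path
        starting_url="http://www.example.edu/catalog/",  # placeholder url
        limiting_domain="example.edu",                    # placeholder domain
    )
    # Each key is a course id; each value holds the words drawn from the
    # course title and description.
    print(len(index), "courses indexed")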
def build_search_engine(starting_url, limiting_domain, max_num_pages_to_visit):
    '''
    Crawls up to max_num_pages_to_visit pages starting from starting_url,
    indexes them, and returns a search function over the resulting index
    (or None if the starting url is not ok to follow).
    '''
    urls = queue.Queue()
    visited = []
    index = {}

    def search(word):
        rv = []
        matches = []
        words = re.findall(r"[a-zA-Z]\w*", word)
        if len(words) == 0:
            return []
        for url in index.keys():
            for title in index[url].keys():
                for word in words:
                    word = word.lower()
                    if word in title or word in index[url][title]:
                        matches.append((title, url))
        # keep only the (title, url) pairs that matched every query word
        for pair in matches:
            if matches.count(pair) == len(words):
                if pair not in rv:
                    rv.append(pair)
        return rv

    if util.is_url_ok_to_follow(starting_url, limiting_domain):
        urls.put(starting_url)
        while not urls.empty() and len(visited) < max_num_pages_to_visit:
            top_queue = urls.get()
            if top_queue not in visited and util.is_url_ok_to_follow(
                    top_queue, limiting_domain):
                request = util.get_request(top_queue)
                if request is None:
                    visited.append(top_queue)
                    continue
                new_page = util.get_request_url(request)
                if new_page != top_queue:
                    if new_page not in visited:
                        visited.append(new_page)
                    top_queue = new_page
                data = bs4.BeautifulSoup(util.read_request(request), "html5lib")
                visited.append(top_queue)
                index = indexer(index, top_queue, data)
                for link in data.find_all('a'):
                    href = link.get('href')
                    if href is None:
                        continue
                    href = util.remove_fragment(href)
                    if not util.is_absolute_url(href):
                        url = util.convert_if_relative_url(top_queue, href)
                        urls.put(url)
    else:
        return None
    return search
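# Sketch of how the closure returned by build_search_engine might be used.
# The url and domain are placeholders; build_search_engine returns None when
# the starting url is not ok to follow, so the result is checked first.
search = build_search_engine("http://www.example.edu/catalog/",  # placeholder
                             "example.edu", max_num_pages_to_visit=100)
if search is not None:
    # Every (title, url) pair returned matches all words in the query.
    for title, url in search("linear algebra"):
        print(title, url)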
def crawl(url, limit):
    '''
    Takes a starting url and subsequently visits links that are found on
    that page and on the following visited ones. Updates a dictionary by
    mapping words to course codes if information about courses is found
    in the followed urls.

    Inputs:
        url: (string) url from which to start the crawl
        limit: (int) total number of sites to be visited

    Outputs:
        visited_links - list of the links visited during the crawl
    '''
    q = queue.Queue()
    limiting_domain = "federalreserve.gov/monetarypolicy/"
    visited_links = [url]
    q.put(url)
    count = 0
    while not q.empty() and count <= limit:
        links, soup = calendar_scraper(q.get(), limiting_domain, visited_links)
        # extract_words(soup, d_words, course_map_filename)
        if len(links) == 0:
            continue
        for link in links:
            count += 1
            req = util.get_request(link)
            if req is None:
                continue
            link2 = util.get_request_url(req)
            if link2 in visited_links:
                continue
            q.put(link2)
            visited_links.append(link2)
            visited_links.append(link)
    return visited_links
def get_clean_urls(url, limiting_domain):
    '''
    Given a url for a webpage, create and return a list of all 'a' tag urls
    in that page that have been cleaned (absolute urls only) and are 'ok'
    to follow.

    Inputs:
        url - absolute url
        limiting_domain - domain name

    Outputs:
        list of absolute urls
    '''
    soup = get_soup_from_url(url)
    all_a_tags = soup.find_all("a")

    # get urls (i.e. if tag has 'href' attribute)
    clean_urls = []
    for tag in all_a_tags:
        if tag.has_attr('href'):
            absolute = util.convert_if_relative_url(url, tag['href'])
            if not util.is_url_ok_to_follow(absolute, limiting_domain):
                continue
            absolute = util.remove_fragment(absolute)
            # protocol field reversion
            temp_request = util.get_request(absolute)
            if temp_request is None:
                continue
            reverted_url = util.get_request_url(temp_request)
            # is url ok to follow based on specification in PA2
            if util.is_url_ok_to_follow(reverted_url, limiting_domain):
                clean_urls.append(reverted_url)

    # remove duplicates
    final_url_list = []
    for link in clean_urls:
        if link not in final_url_list:
            final_url_list.append(link)
    return final_url_list
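# One way get_clean_urls could drive a breadth-first crawl. This is a sketch
# assuming get_soup_from_url and the util module behave as they are used
# above; the starting url and page limit are placeholders.
import queue

def bfs_crawl(start_url, limiting_domain, max_pages=100):
    to_visit = queue.Queue()
    to_visit.put(start_url)
    seen = []
    while not to_visit.empty() and len(seen) < max_pages:
        current = to_visit.get()
        if current in seen:
            continue
        seen.append(current)
        # expand the frontier with cleaned, followable urls from this page
        for next_url in get_clean_urls(current, limiting_domain):
            if next_url not in seen:
                to_visit.put(next_url)
    return seen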
def mini_crawler(url, page_parser_q, pull_info_q, links_visited,
                 limiting_domain, index_list):
    '''
    Crawls the college catalog and adds to an index list that maps sets of
    words to their associated course identifiers.

    Inputs:
        url: starting url to begin crawling with
        page_parser_q: queue of urls in line to be crawled
        pull_info_q: queue of urls whose course information will be pulled
        links_visited: list of visited links
        limiting_domain: domain name
        index_list: list of dictionaries for individual programs
    '''
    if url in links_visited:
        return
    request = util.get_request(url)
    if request is None:
        return
    post_url = util.get_request_url(request)
    if post_url in links_visited:
        return
    html = util.read_request(request)
    soup = bs4.BeautifulSoup(html, features="html5lib")
    find_links(soup, url, post_url, page_parser_q, pull_info_q,
               links_visited, limiting_domain)
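# find_links is called above but not included in this listing. The following
# is only a plausible sketch matching the seven-argument call in the
# delegating mini_crawler: collect each anchor's href, convert it to an
# absolute url, and queue it. How links are routed between the two queues is
# an assumption here, not the original logic.
def find_links(soup, url, post_url, page_parser_q, pull_info_q,
               links_visited, limiting_domain):
    for tag in soup.find_all("a", href=True):
        href = util.convert_if_relative_url(post_url, tag["href"])
        if href is None or href in links_visited:
            continue
        if util.is_url_ok_to_follow(href, limiting_domain):
            page_parser_q.put(href)   # crawl this page later
            pull_info_q.put(href)     # also queue it for information pulling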
def calendar_scraper(url, limiting_domain, visited_links):
    '''
    Extracts links from a given url.

    Inputs:
        url - (string) url from which to get links
        limiting_domain: (string) that links must match
        visited_links: (list) of already visited sites

    Outputs:
        links - list of strings, non-repeated and not previously visited
            links (calendar and article links combined)
        soup - soup object corresponding to the visited url (to be used
            for getting words)
    '''
    # A. Extracting links
    req = util.get_request(url)
    url2 = util.get_request_url(req)
    soup = make_soup(req)
    cal = []
    art = []
    if soup:
        # Calendar links
        cal_list = soup.find_all("div", class_="panel panel_default")
        for d in cal_list:
            link = d.find("a", href=True)
            if link is None:
                continue
            d_tr = util.remove_fragment(link.get("href"))
            d_abs = util.convert_if_relative_url(url2, d_tr)
            if util.is_url_ok_to_follow(d_abs, limiting_domain) and \
                    d_abs not in visited_links and d_abs not in cal:
                cal.append(d_abs)
        # Article links (currently collected with the same selector)
        art_list = soup.find_all("div", class_="panel panel_default")
        for d in art_list:
            link = d.find("a", href=True)
            if link is None:
                continue
            d_tr = util.remove_fragment(link.get("href"))
            d_abs = util.convert_if_relative_url(url2, d_tr)
            if util.is_url_ok_to_follow(d_abs, limiting_domain) and \
                    d_abs not in visited_links and d_abs not in art:
                art.append(d_abs)
    return cal + art, soup
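# make_soup is used above but not included in this listing. A minimal
# stand-in, assuming the request object comes from util.get_request and that
# an unreadable page should yield None so the `if soup:` guard above works.
def make_soup(request):
    if request is None:
        return None
    html = util.read_request(request)
    if not html:
        return None
    return bs4.BeautifulSoup(html, features="html5lib")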