def get_restaurant_links_chicago():
    """
    Collect restaurant links by paging through Yelp search results for
    "Restaurants" in Chicago, IL (pages &start=0 .. &start=230, step 10).

    Returns:
        links (list of str): unique restaurant urls, in first-seen order.
    """
    page_suffix = range(0, 231, 10)
    base = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start='
    url_list = [base + str(suffix) for suffix in page_suffix]

    # BUG FIX: the original re-initialized `links = []` inside the page loop,
    # discarding every page's results except the last one processed.
    links = []
    for url in url_list:
        request = util.get_request(url)
        if request is None:
            # BUG FIX: skip unreachable pages instead of crashing in read_request
            continue
        text = util.read_request(request)
        soup = bs4.BeautifulSoup(text, "html5lib")
        tags = soup.find_all('a', href=True, target="", role="")
        # extract href links to restaurants
        for tag in tags:
            link = util.remove_fragment(util.convert_if_relative_url(url, tag['href']))
            if link is None:
                # convert_if_relative_url can yield None; guard before slicing
                continue
            # Hardcoded filter: keep urls ending in "Restaurants" from named tags.
            # BUG FIX: tag.get avoids a KeyError when the 'name' attribute is absent.
            if link[-11:] == "Restaurants" and tag.get("name"):
                if link not in links:
                    links.append(link)
    return links
def build_search_engine(starting_url, limiting_domain, max_num_pages_to_visit):
    # Crawl up to max_num_pages_to_visit pages reachable from starting_url
    # (restricted to limiting_domain), build a word index via indexer(), and
    # return a `search` closure over that index.  Returns None if the
    # starting url itself is not ok to follow.
    # NOTE(review): `Queue.Queue` is the Python 2 module name — presumably this
    # file targets Python 2 or aliases queue as Queue; confirm at import site.
    urls = Queue.Queue()
    visited = []
    index = {}

    def search(word):
        # Return (title, url) pairs whose title or indexed words contain
        # EVERY query word.  Query words are alphanumeric tokens starting
        # with a letter; matching is done on the lowercased word.
        rv = []
        matches = []
        words = re.findall("[a-zA-Z]\w*", word)
        if len(words) == 0:
            return []
        # index is assumed shaped {url: {title: words}} — as produced by
        # indexer() below; TODO confirm against indexer's definition.
        for url in index.keys():
            for title in index[url].keys():
                for word in words:
                    word = word.lower()
                    if word in title or word in index[url][title]:
                        matches.append((title, url))
        # A pair appears once per matched query word, so a count equal to
        # len(words) means every word matched (AND semantics).
        for pair in matches:
            if matches.count(pair) == len(words):
                if pair not in rv:
                    rv.append(pair)
        return rv

    if util.is_url_ok_to_follow(starting_url, limiting_domain):
        urls.put(starting_url)
        while not urls.empty() and len(visited) < max_num_pages_to_visit:
            top_queue = urls.get()
            if top_queue not in visited and util.is_url_ok_to_follow(
                    top_queue, limiting_domain):
                request = util.get_request(top_queue)
                if request == None:
                    # Unreachable page still counts toward the visit budget.
                    visited.append(top_queue)
                    continue
                # Follow redirects: index under the final (post-redirect) url.
                new_page = util.get_request_url(request)
                if new_page != top_queue:
                    if new_page not in visited:
                        visited.append(new_page)
                        top_queue = new_page
                data = bs4.BeautifulSoup(util.read_request(request))
                visited.append(top_queue)
                index = indexer(index, top_queue, data)
                for link in data.find_all('a'):
                    href = link.get('href')
                    if href == None:
                        continue
                    href = util.remove_fragment(href)
                    # NOTE(review): only relative hrefs are enqueued; absolute
                    # links are never followed — looks intentional but verify.
                    if not util.is_absolute_url(href):
                        url = util.convert_if_relative_url(top_queue, href)
                        urls.put(url)
    else:
        return None
    return search
def url_check(url, parent_url):
    '''
    Normalize a url against its parent page: convert a relative url to an
    absolute one, then strip any fragment.

    Inputs:
        url - candidate url (absolute or relative)
        parent_url - url of the page the candidate came from

    Returns the cleaned url string, or None if nothing usable remains.
    '''
    if not util.is_absolute_url(url):
        url = util.convert_if_relative_url(parent_url, url)
    cleaned = util.remove_fragment(url)
    return cleaned if cleaned else None
def get_clean_urls(url, limiting_domain):
    '''
    Given a webpage url, return a list of all 'a'-tag urls on that page that
    have been cleaned (absolute, fragment-free, redirect-resolved) and are
    'ok' to follow.

    Inputs:
        url - absolute url of the page to scan
        limiting_domain - domain name links must stay within

    Outputs:
        list of absolute urls, de-duplicated in first-seen order
    '''
    soup = get_soup_from_url(url)
    all_a_tags = soup.find_all("a")

    clean_urls = []
    for tag in all_a_tags:
        if not tag.has_attr('href'):
            continue
        absolute = util.convert_if_relative_url(url, tag['href'])
        if not util.is_url_ok_to_follow(absolute, limiting_domain):
            continue
        # BUG FIX: remove_fragment returns a new string (str is immutable);
        # the original discarded the result, so fragments were never removed.
        absolute = util.remove_fragment(absolute)
        # Protocol-field reversion: re-request to get the server's final url.
        temp_request = util.get_request(absolute)
        if temp_request is None:
            continue
        reverted_url = util.get_request_url(temp_request)
        # Re-check after redirect resolution (per PA2 specification).
        if util.is_url_ok_to_follow(reverted_url, limiting_domain):
            clean_urls.append(reverted_url)

    # Remove duplicates while preserving first-seen order.
    return list(dict.fromkeys(clean_urls))
def get_restaurant_links_cook():
    """
    Collect restaurant links from Yelp search pages for Cook County
    city/state pairs (24 result pages per location).

    Returns:
        links (list of str): restaurant urls, each suffixed with a newline.
    """
    cities = get_cities()
    city_state = get_loc_cook()
    # BUG FIX: the original built this filtered list (`new_city_state`) but
    # then iterated the unfiltered `city_state` — the filter had no effect.
    city_state = [pair for pair in city_state if pair[0] in cities]

    page_suffix = range(0, 231, 10)
    url_list = []
    for city, state in city_state:
        base = ("https://www.yelp.com/search?find_desc=Restaurants&find_loc="
                + city.replace(" ", "") + "%2C%20" + state)
        for suffix in page_suffix:
            url_list.append(base + "&start=" + str(suffix))

    # NOTE(review): debug leftover — this overrides the full url list with a
    # single hard-coded page.  Kept to preserve current behavior; remove to
    # actually crawl every location built above.
    url_list = [
        "https://www.yelp.com/search?find_desc=Restaurants&find_loc=Lyons%2C%20IL&start=190"
    ]

    # BUG FIX: accumulate across all pages; the original reset `links = []`
    # inside the loop, keeping only the last successful page's results.
    links = []
    for url in url_list:
        request = util.get_request(url)
        if not request:
            continue
        text = util.read_request(request)
        soup = bs4.BeautifulSoup(text, "html5lib")
        tags = soup.find_all('a', href=True, target="", role="")
        # extract href links to restaurants
        for tag in tags:
            link = util.remove_fragment(util.convert_if_relative_url(url, tag['href']))
            if link is None:
                # convert_if_relative_url can yield None; guard before slicing
                continue
            # Hardcoded filter: keep urls ending in "Restaurants" from named tags.
            if link[-11:] == "Restaurants" and tag.get("name"):
                if link not in links:
                    links.append(link + "\n")
    return links
def calendar_scraper(url, limiting_domain):
    '''
    Extracts calendar and article links from a given url.

    Inputs:
        url - (string) url from which to scrape
        limiting_domain - (string) domain that links must match

    Outputs:
        (cal, art) - two lists of absolute, fragment-free urls that are ok
        to follow; None (implicitly) if the page could not be souped.
    '''
    # A. Extracting links
    req = util.get_request(url)
    url2 = util.get_request_url(req)
    soup = make_soup(req)
    if soup:
        # NOTE(review): both searches use class "panel panel_default";
        # Bootstrap's convention is "panel panel-default" — confirm markup.
        cal = []
        cal_list = soup.find_all("div", class_="panel panel_default")
        # BUG FIX: the original iterated undefined `div_list` and read
        # `link.get("href")` where `link` was never bound (NameError).
        for d in cal_list:
            d_tr = util.remove_fragment(d.get("href"))
            d_abs = util.convert_if_relative_url(url2, d_tr)
            if util.is_url_ok_to_follow(d_abs, limiting_domain):
                cal.append(d_abs)

        art = []
        art_list = soup.find_all("div", class_="panel panel_default")
        for d in art_list:
            d_tr = util.remove_fragment(d.get("href"))
            d_abs = util.convert_if_relative_url(url2, d_tr)
            if util.is_url_ok_to_follow(d_abs, limiting_domain):
                # BUG FIX: the second loop appended to `cal`, leaving `art`
                # permanently empty.
                art.append(d_abs)
        return cal, art
def get_restaurant_links():
    '''
    Start from searching "Restaurant", "Chicago" on yelp main page, and
    collect all restaurant links from 24 pages

    Input:
        None

    Output:
        links (list): a list of links (each suffixed with a newline)
    '''
    page_suffix = range(0, 231, 10)
    base = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start='
    url_list = [base + str(suffix) for suffix in page_suffix]

    links = []
    count = 0
    for url in url_list:
        count += 1
        print(count)
        request = util.get_request(url)
        if request is None:
            # BUG FIX: skip unreachable pages instead of crashing in read_request
            continue
        text = util.read_request(request)
        soup = bs4.BeautifulSoup(text, "html5lib")
        tags = soup.find_all('a', href=True, target="", role="")
        for tag in tags:
            link = util.remove_fragment(util.convert_if_relative_url(url, tag['href']))
            if link is None:
                # convert_if_relative_url can yield None; guard before slicing
                continue
            # BUG FIX: tag.get avoids KeyError when 'name' attribute is absent.
            if link[-11:] == "Restaurants" and tag.get("name"):
                if link not in links:
                    links.append(link + "\n")
                    print(link)
        # Random 5-10s delay between pages to avoid hammering Yelp.
        i = 5 + random.random() * 5
        time.sleep(i)
    return links
def scrape():
    '''
    performs entire scraping function

    outputs:
        index: dictionary of museum/exhibit information, keyed
        museum_id -> exhibit_id -> exhibit info dict (plus 'url');
        also writes ../csvs/musid_name.csv mapping museum ids to names
    '''
    index = {}
    for museum_id in scrape_dict:
        limiter = scrape_dict[museum_id]['limiter']
        pages = scrape_dict[museum_id]['page']
        exhibit_urls = []
        # Crawl each listing page, collecting candidate exhibit urls.
        for page in pages:
            r = requests.get(page)
            soup = bs4.BeautifulSoup(r.text, "html5lib")
            for link in soup.find_all('a', href=True):
                u = util.convert_if_relative_url(page, link['href'])
                u = util.remove_fragment(u)
                restr = scrape_dict[museum_id]['restr']
                crawl(limiter, exhibit_urls, u, restr)
        index[museum_id] = {}
        exhibit_id = museum_id + '01'
        for link in exhibit_urls:
            r = requests.get(link)
            soup = bs4.BeautifulSoup(r.text, "html5lib")
            print(link)
            try:
                index[museum_id][exhibit_id] = {}
                # Museum-specific parser fills in this exhibit's entry.
                scrape_dict[museum_id]['info'](soup, index[museum_id],
                                               exhibit_id)
                index[museum_id][exhibit_id]['url'] = link
                exhibit_id = '00' + str(int(exhibit_id) + 1)
            # BUG FIX: bare `except:` also swallowed SystemExit and
            # KeyboardInterrupt; best-effort scraping only needs Exception.
            except Exception:
                print('\t^^ Scraper Failed')
    with open('../csvs/musid_name.csv', 'w') as f:
        line = 'mus_id|name' + '\n'
        f.write(line)
        for mus_id in scrape_dict:
            line = '{}|{}\n'.format(str(mus_id), \
                scrape_dict[mus_id]['name'])
            f.write(line)
    return index
def scrape():
    '''
    performs entire scraping function

    outputs:
        index: dictionary of museum/exhibit information, keyed
        museum_id -> exhibit_id -> exhibit info dict (plus 'url');
        also writes ../csvs/musid_name.csv mapping museum ids to names
    '''
    index = {}
    for museum_id in scrape_dict:
        limiter = scrape_dict[museum_id]['limiter']
        pages = scrape_dict[museum_id]['page']
        exhibit_urls = []
        # Crawl each listing page, collecting candidate exhibit urls.
        for page in pages:
            r = requests.get(page)
            soup = bs4.BeautifulSoup(r.text, "html5lib")
            for link in soup.find_all('a', href=True):
                u = util.convert_if_relative_url(page, link['href'])
                u = util.remove_fragment(u)
                restr = scrape_dict[museum_id]['restr']
                crawl(limiter, exhibit_urls, u, restr)
        index[museum_id] = {}
        exhibit_id = museum_id + '01'
        for link in exhibit_urls:
            r = requests.get(link)
            soup = bs4.BeautifulSoup(r.text, "html5lib")
            print(link)
            try:
                index[museum_id][exhibit_id] = {}
                # Museum-specific parser fills in this exhibit's entry.
                scrape_dict[museum_id]['info'](soup, index[museum_id],
                                               exhibit_id)
                index[museum_id][exhibit_id]['url'] = link
                exhibit_id = '00' + str(int(exhibit_id) + 1)
            # BUG FIX: bare `except:` also swallowed SystemExit and
            # KeyboardInterrupt; best-effort scraping only needs Exception.
            except Exception:
                print('\t^^ Scraper Failed')
    with open('../csvs/musid_name.csv', 'w') as f:
        line = 'mus_id|name' + '\n'
        f.write(line)
        for mus_id in scrape_dict:
            line = '{}|{}\n'.format(str(mus_id), \
                scrape_dict[mus_id]['name'])
            f.write(line)
    return index
def clean_url(url, limiting_domain, parent_url):
    '''
    Cleans the given url, if necessary.

    Inputs:
        url: (string) A url
        limiting_domain: (string) The limiting domain of the url.
        parent_url: (string) The parent url if the given url is incomplete.

    Outputs:
        The cleaned url if it is ok to follow, and None otherwise.
    '''
    # Strip any fragment first, then resolve against the parent page.
    candidate = util.convert_if_relative_url(parent_url,
                                             util.remove_fragment(url))
    return candidate if util.is_url_ok_to_follow(candidate,
                                                 limiting_domain) else None
def get_all_links(soup, url):
    '''
    Collect every unique absolute url referenced by an <a href=...> tag in
    the given soup.

    Input:
        soup (bs4 object): parsed soup of the web page
        url (str): the url the soup was built from (needed to resolve
            relative links into absolute ones)

    Output:
        all_links (list): unique absolute urls, in first-seen order
    '''
    all_links = []
    for anchor in soup.find_all('a'):
        if not anchor.has_attr('href'):
            continue
        stripped = util.remove_fragment(anchor['href'])
        absolute = util.convert_if_relative_url(url, stripped)
        if absolute not in all_links:
            all_links.append(absolute)
    return all_links
def queue_urls(url, soup, queue, limiting_domain): ''' Forms a queue of all the urls Inputs: url: the url to put into the queue soup: BeautifulSoup object queue: the existing queue limiting_domain: a domain with which to stay in when queuing Outputs: None ''' for link in soup.find_all('a'): clean_url = util.convert_if_relative_url( url, util.remove_fragment(link.get('href'))) if util.is_absolute_url(clean_url) and str(clean_url)[0] != 'b': if (util.is_url_ok_to_follow( clean_url, limiting_domain)) and clean_url not in queue.all_items: queue.enqueue(clean_url)
def get_neighbors(node):
    """
    Fetch the page at `node` and return its outgoing links.

    Inputs:
        node (str): absolute url to fetch

    Returns:
        (neighbors, response, soup):
            neighbors - list of absolute urls found in <a> tags, or None if
                the page could not be fetched/read;
            response - the request object from util.get_request (may be None);
            soup - the parsed page, or None on failure.
    """
    print(" completed")
    neighbors = []
    # BUG FIX: `soup` was only bound on the success path, so the early
    # failure branches crashed with NameError at the return statement.
    soup = None
    response = util.get_request(node)
    if response is None:
        print('No response')
        neighbors = None
    else:
        text = util.read_request(response)
        if text == "":
            print("No text read")
            neighbors = None
        else:
            soup = bs4.BeautifulSoup(text, "html5lib")
            for link in soup.find_all("a"):
                url_raw = link.get("href")
                url_rel = util.remove_fragment(url_raw)
                url = util.convert_if_relative_url(node, url_rel)
                print(url)
                if url is not None:
                    neighbors.append(url)
    return neighbors, response, soup
def get_lyrics(cnx, cursor, url=STARTING_URL, visited_pages=None):
    """
    Recursively crawl lyrics pages starting at `url`, inserting the raw
    text of each *.txt page into the database via the ADD_RAW_TEXT query.

    Inputs:
        cnx: open database connection (committed after each insert)
        cursor: cursor on that connection
        url: page to visit (defaults to STARTING_URL)
        visited_pages: set of urls already visited in this crawl

    Returns:
        None (results are written to the database as a side effect).
    """
    # BUG FIX: the original used a mutable default (`visited_pages=set()`),
    # which persisted between separate top-level calls; each new crawl would
    # silently skip everything a previous crawl had visited.
    if visited_pages is None:
        visited_pages = set()
    print("\ntrying", url)
    visited_pages.add(url)
    print("visited_pages length: {}".format(len(visited_pages)))
    try:
        url, soup = open_page(url)
    except Exception:
        print("there was an exception - you done f****d up\n")
        return None
    if not url:
        print("no url")
        return None

    ## base case: a .txt page holds the lyrics themselves
    if url[-4:] == '.txt':
        if soup.find('pre'):
            text = soup.find('pre').text
            print("adding {}\n".format(url))
        else:
            # Fall back to the single <p> containing 'Artist:', else raw text.
            ps = soup.find_all('p')
            ps = list(filter(lambda x: 'Artist:' in x.text, ps))
            if len(ps) == 1:
                text = ps[0].text
            else:
                text = soup.text
        cursor.execute(ADD_RAW_TEXT, (url, text))
        cnx.commit()
    ## recursive case: an index page — follow its links
    else:
        # Restrict to the main content column when the page has one.
        if soup.find('div', id='leftmain'):
            tag = soup.find('div', id='leftmain')
        else:
            tag = soup
        for link in tag.find_all('a', href=True):
            clean_link = util.remove_fragment(link['href'])
            if not clean_link:
                print("no clean link")
                continue
            abs_link = util.convert_if_relative_url(url, clean_link)
            if (abs_link in visited_pages
                    or not util.is_url_ok_to_follow(abs_link, LIMITING_DOMAIN)
                    or 'update' in abs_link):
                print("child link shan't be followed: {}".format(abs_link))
                continue
            get_lyrics(cnx, cursor, abs_link, visited_pages)