Example no. 1
def get_restaurant_links_chicago():
    # start from searching "Restaurant", "Chicago" from yelp main page
    page_suffix = [i for i in range(0, 231, 10)]
    url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start='
    url_list = []

    for suffix in page_suffix:
        page_url = url + str(suffix)
        url_list.append(page_url)

    links = []

    for url in url_list:

        request = util.get_request(url)
        if request is None:
            continue
        text = util.read_request(request)

        soup = bs4.BeautifulSoup(text, "html5lib")
        tags = soup.find_all('a', href=True, target="", role="")

        # extract href links to restaurants (accumulated across all pages)
        for tag in tags:
            link = tag['href']
            link = util.convert_if_relative_url(url, link)
            link = util.remove_fragment(link)
            # Hardcoded filter
            if link and link[-11:] == "Restaurants":
                if tag.get("name"):
                    if link not in links:
                        links.append(link)

    return links
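Every snippet in this listing leans on a small util helper module that is never shown. The sketch below is only a guess at minimal implementations, inferred from how the helpers are called in these examples; the actual module used by these projects may filter URLs more strictly or handle failures differently.

import urllib.parse
import urllib.request


def get_request(url):
    """Open a URL, returning the response object or None on failure."""
    try:
        return urllib.request.urlopen(url)
    except Exception:
        return None


def read_request(request):
    """Read the body of a response as text, or '' on failure."""
    try:
        return request.read().decode("utf-8", errors="replace")
    except Exception:
        return ""


def get_request_url(request):
    """Return the URL the response was ultimately served from (after redirects)."""
    return request.geturl()


def is_absolute_url(url):
    """True if the URL already carries a scheme and a network location."""
    parsed = urllib.parse.urlparse(url)
    return bool(parsed.scheme) and bool(parsed.netloc)


def remove_fragment(url):
    """Strip the #fragment portion of a URL, if any."""
    return urllib.parse.urldefrag(url)[0] if url else url


def convert_if_relative_url(current_url, new_url):
    """Resolve new_url against current_url when it is relative."""
    if new_url is None:
        return None
    return urllib.parse.urljoin(current_url, new_url)


def is_url_ok_to_follow(url, limiting_domain):
    """Rough stand-in: follow only http(s) URLs inside the limiting domain."""
    if not url:
        return False
    parsed = urllib.parse.urlparse(url)
    return parsed.scheme in ("http", "https") and limiting_domain in parsed.netloc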
Example no. 2
def build_search_engine(starting_url, limiting_domain, max_num_pages_to_visit):
    urls = Queue.Queue()
    visited = []
    index = {}

    def search(word):
        rv = []
        matches = []
        words = re.findall(r"[a-zA-Z]\w*", word)
        if len(words) == 0:
            return []
        for url in index.keys():
            for title in index[url].keys():
                for word in words:
                    word = word.lower()
                    if word in title or word in index[url][title]:
                        matches.append((title, url))
        for pair in matches:
            if matches.count(pair) == len(words):
                if pair not in rv:
                    rv.append(pair)
        return rv

    if util.is_url_ok_to_follow(starting_url, limiting_domain):
        urls.put(starting_url)
        while not urls.empty() and len(visited) < max_num_pages_to_visit:
            top_queue = urls.get()
            if top_queue not in visited and util.is_url_ok_to_follow(
                    top_queue, limiting_domain):
                request = util.get_request(top_queue)
                if request is None:
                    visited.append(top_queue)
                    continue
                new_page = util.get_request_url(request)
                if new_page != top_queue:
                    if new_page not in visited:
                        visited.append(new_page)
                        top_queue = new_page
                data = bs4.BeautifulSoup(util.read_request(request), "html5lib")
                visited.append(top_queue)
                index = indexer(index, top_queue, data)
                for link in data.find_all('a'):
                    href = link.get('href')
                    if href is None:
                        continue
                    href = util.remove_fragment(href)
                    if not util.is_absolute_url(href):
                        href = util.convert_if_relative_url(top_queue, href)
                    urls.put(href)
    else:
        return None
    return search
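A hypothetical call to build_search_engine could look like the following; the starting URL, limiting domain, page budget, and query are placeholders rather than values from the original project.

# placeholders throughout; build_search_engine returns None if the starting
# url is not ok to follow, otherwise a search(word) closure
search = build_search_engine("https://www.example.edu/courses/index.html",
                             "example.edu",
                             max_num_pages_to_visit=50)
if search is not None:
    for title, url in search("linear algebra"):
        print(title, url)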
Example no. 3
def url_check(url, parent_url):
    '''
    Takes a url and its parent url, and does various checks on the url,
    returning the url in the correct format if it is ok to use and
    returning None if not.
    '''

    if not util.is_absolute_url(url):
        url = util.convert_if_relative_url(parent_url, url)
    url = util.remove_fragment(url)
    if url:
        return url
    else:
        return None
Example no. 4
def get_clean_urls(url, limiting_domain):
    '''
    Given a url, create and return a list of all 'a' tag urls on that
    page that have been cleaned (absolute urls only) and are 'ok' to follow.

    Inputs:
        url - absolute url
        limiting_domain - domain name
    Outputs:
        list of absolute urls
    '''
    soup = get_soup_from_url(url)
    all_a_tags = soup.find_all("a")
    # get urls (i.e. if tag has 'href' attribute)
    clean_urls = []
    for tag in all_a_tags:
        if tag.has_attr('href'):
            absolute = util.convert_if_relative_url(url, tag['href'])
            if not util.is_url_ok_to_follow(absolute, limiting_domain):
                continue
            absolute = util.remove_fragment(absolute)
            # protocol field reversion
            temp_request = util.get_request(absolute)
            if temp_request is None:
                continue
            reverted_url = util.get_request_url(temp_request)
            # is url ok to follow based on specification in PA2
            if util.is_url_ok_to_follow(reverted_url, limiting_domain):
                clean_urls.append(reverted_url)
    # remove duplicates
    final_url_list = []
    for link in clean_urls:
        if link not in final_url_list:
            final_url_list.append(link)

    return final_url_list
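get_clean_urls depends on a get_soup_from_url helper that does not appear in this listing. A minimal sketch of what it might do, assuming the same util module and bs4 usage as the surrounding examples, is:

import bs4
import util  # course-style helper module assumed throughout these examples


def get_soup_from_url(url):
    # Hypothetical helper inferred from how get_clean_urls uses it: fetch the
    # page and parse it; an empty soup is returned when the request fails so
    # the caller's find_all simply yields nothing.
    request = util.get_request(url)
    if request is None:
        return bs4.BeautifulSoup("", "html5lib")
    text = util.read_request(request)
    return bs4.BeautifulSoup(text, "html5lib")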
Example no. 5
def get_restaurant_links_cook():
    cities = get_cities()

    city_state = get_loc_cook()
    new_city_state = []
    for ele in city_state:
        if ele[0] in cities:
            new_city_state.append(ele)

    page_suffix = [i for i in range(0, 231, 10)]
    #print(city_state)

    url_list = []
    for city, state in new_city_state:
        html = "https://www.yelp.com/search?find_desc=Restaurants&find_loc=" + city.replace(
            " ", "") + "%2C%20" + state
        for suffix in page_suffix:
            html_page = html + "&start=" + str(suffix)
            url_list.append(html_page)
    r'''
    with open(r"c:\Users\35653\Desktop\CS122\project\urls.txt", "w") as write_file:
        write_file.writelines(url_list)

        write_file.close()
    '''

    # Debug override: uncomment to restrict the crawl to a single results page.
    # url_list = [
    #     "https://www.yelp.com/search?find_desc=Restaurants&find_loc=Lyons%2C%20IL&start=190"
    # ]

    links = []
    for url in url_list:
        request = util.get_request(url)
        if request:

            text = util.read_request(request)

            soup = bs4.BeautifulSoup(text, "html5lib")
            tags = soup.find_all('a', href=True, target="", role="")

            # extract href links to restaurants (accumulated across all pages)
            for tag in tags:
                link = tag['href']
                link = util.convert_if_relative_url(url, link)
                link = util.remove_fragment(link)
                # Hardcoded filter
                if link and link[-11:] == "Restaurants":
                    if tag.get("name"):
                        if link not in links:
                            links.append(link + "\n")
    return links
Example no. 6
def calendar_scraper(url, limiting_domain):
    '''
    Extracts calendar and article links from a given url.

    Inputs:
        url - (string) url from which to get links
        limiting_domain - (string) domain that links must match
    Outputs:
        cal - list of calendar links that are ok to follow
        art - list of article links that are ok to follow
    '''
    #A. Extracting links
    cal = []
    art = []
    req = util.get_request(url)
    url2 = util.get_request_url(req)
    soup = make_soup(req)

    if soup:
        cal_list = soup.find_all("div", class_="panel panel_default")
        for d in cal_list:
            link = d.find("a")  # first link inside each panel
            if link is None or link.get("href") is None:
                continue
            d_tr = util.remove_fragment(link.get("href"))
            d_abs = util.convert_if_relative_url(url2, d_tr)
            if util.is_url_ok_to_follow(d_abs, limiting_domain):
                cal.append(d_abs)

        art_list = soup.find_all("div", class_="panel panel_default")
        for d in art_list:
            link = d.find("a")  # first link inside each panel
            if link is None or link.get("href") is None:
                continue
            d_tr = util.remove_fragment(link.get("href"))
            d_abs = util.convert_if_relative_url(url2, d_tr)
            if util.is_url_ok_to_follow(d_abs, limiting_domain):
                art.append(d_abs)
    return cal, art
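calendar_scraper likewise calls a make_soup helper that is not shown; a plausible reconstruction, returning None when nothing could be fetched so that the if soup: guard above makes sense, might be:

import bs4
import util  # course-style helper module assumed throughout these examples


def make_soup(req):
    # Hypothetical helper: turn a request object into a soup, or None when
    # the request failed or returned no text.
    if req is None:
        return None
    text = util.read_request(req)
    if not text:
        return None
    return bs4.BeautifulSoup(text, "html5lib")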
Example no. 7
def get_restaurant_links():
    '''
    Start from searching "Restaurant", "Chicago" on yelp main page,
    and collect all restaurant links from 24 pages

    Input:
        None

    Output:
        links (list): a list of links
    '''

    page_suffix = [i for i in range(0, 231, 10)]
    url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start='
    url_list = []

    for suffix in page_suffix:
        page_url = url + str(suffix)
        url_list.append(page_url)

    links = []
    count = 0

    for url in url_list:
        count += 1
        print(count)

        request = util.get_request(url)
        if request is None:
            continue
        text = util.read_request(request)

        soup = bs4.BeautifulSoup(text, "html5lib")
        tags = soup.find_all('a', href=True, target="", role="")

        for tag in tags:
            link = tag['href']
            link = util.convert_if_relative_url(url, link)
            link = util.remove_fragment(link)
            if link and link[-11:] == "Restaurants":
                if tag.get("name"):
                    if link not in links:
                        links.append(link + "\n")
                        print(link)

        # pause 5-10 seconds between result pages to avoid hammering the site
        i = 5 + random.random() * 5
        time.sleep(i)

    return links
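Because each collected link already ends in a newline, a caller could dump the result straight to disk; the filename below is illustrative only and not taken from the original code.

links = get_restaurant_links()
with open("restaurant_links.txt", "w") as f:  # illustrative filename
    f.writelines(links)  # each entry already ends with "\n"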
Example no. 8
File: scrape.py Project: ccr122/ccr
def scrape():
    '''
    performs entire scraping function
    outputs:
        index: dictionary of museum/exhibit information 
    '''
    index = {}
    for museum_id in scrape_dict:
        limiter = scrape_dict[museum_id]['limiter']
        pages = scrape_dict[museum_id]['page']
        exhibit_urls = []
        for page in pages:
            r = requests.get(page)
            soup = bs4.BeautifulSoup(r.text, "html5lib")
            for link in soup.find_all('a', href=True):
                u = util.convert_if_relative_url(page, link['href'])
                u = util.remove_fragment(u)
                restr = scrape_dict[museum_id]['restr']
                crawl(limiter, exhibit_urls, u, restr)

        index[museum_id] = {}
        exhibit_id = museum_id + '01'

        for link in exhibit_urls:
            r = requests.get(link)
            soup = bs4.BeautifulSoup(r.text, "html5lib")
            print(link)
            try:
                index[museum_id][exhibit_id] = {}
                scrape_dict[museum_id]['info'](soup, index[museum_id],
                                               exhibit_id)
                index[museum_id][exhibit_id]['url'] = link
                exhibit_id = '00' + str(int(exhibit_id) + 1)
            except Exception:
                print('\t^^ Scraper Failed')

        with open('../csvs/musid_name.csv', 'w') as f:
            line = 'mus_id|name' + '\n'
            f.write(line)
            for mus_id in scrape_dict:
                line = '{}|{}\n'.format(str(mus_id), \
                    scrape_dict[mus_id]['name'])
                f.write(line)

    return index
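The crawl helper called above is not part of this listing. A rough sketch of what it might do, assuming it filters each candidate url by the limiting domain and a museum-specific regex before collecting it, is given below; the real helper's behaviour may differ.

import re

import util  # course-style helper module assumed throughout these examples


def crawl(limiter, exhibit_urls, u, restr):
    # Hypothetical reconstruction: keep u only if it resolved to a real url,
    # stays inside the limiting domain, matches the museum-specific pattern,
    # and has not been collected yet.
    if u is None:
        return
    if not util.is_url_ok_to_follow(u, limiter):
        return
    if restr and not re.search(restr, u):
        return
    if u not in exhibit_urls:
        exhibit_urls.append(u)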
Example no. 9
def clean_url(url, limiting_domain, parent_url):
    '''
    Cleans the given url, if necessary.

    Inputs:
        url: (string) A url
        limiting_domain: (string) The limiting domain of the url.
        parent_url: (string) The parent url if the given url is incomplete.

    Outputs:
        The cleaned url if it is ok to follow, and None otherwise.  
    '''

    c_url = util.remove_fragment(url)
    c_url = util.convert_if_relative_url(parent_url, c_url)

    if util.is_url_ok_to_follow(c_url, limiting_domain):
        return c_url

    return None
Example no. 10
def get_all_links(soup, url):
    '''
    Takes a soup object and returns a list of all relevant urls/links
    for this assignment.
    Input:
        soup (bs4 object): a bs4 soup object of the web page
        url (str): the url from which the bs4 soup was constructed
                   (required to convert relative urls into absolute ones)
    Output:
        all_links (list): a list of ready-to-go urls
    '''
    all_links = []

    mega_set = soup.find_all('a')
    for link in mega_set:
        if link.has_attr('href'):
            link = util.remove_fragment(link['href'])
            abs_link = util.convert_if_relative_url(url, link)
            if abs_link and abs_link not in all_links:
                all_links.append(abs_link)

    return all_links
Example no. 11
def queue_urls(url, soup, queue, limiting_domain):
    '''
    Forms a queue of all the urls

    Inputs:
        url: the url to put into the queue
        soup: BeautifulSoup object
        queue: the existing queue
        limiting_domain: a domain with which to stay in when queuing

    Outputs:
        None
    '''
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None:
            continue
        clean_url = util.convert_if_relative_url(url, util.remove_fragment(href))

        if util.is_absolute_url(clean_url) and str(clean_url)[0] != 'b':
            if (util.is_url_ok_to_follow(
                    clean_url,
                    limiting_domain)) and clean_url not in queue.all_items:
                queue.enqueue(clean_url)
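queue_urls assumes a queue object with enqueue and an all_items attribute, which is not a standard library type. A hypothetical stand-in that would satisfy this interface is sketched below.

import collections


class TrackingQueue:
    # Hypothetical FIFO that also remembers every item ever enqueued, so
    # queue_urls can test membership via all_items.
    def __init__(self):
        self.all_items = []
        self._pending = collections.deque()

    def enqueue(self, item):
        self.all_items.append(item)
        self._pending.append(item)

    def dequeue(self):
        return self._pending.popleft()

    def is_empty(self):
        return not self._pending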
Example no. 12
def get_neighbors(node):
    print("      completed")
    neighbors = []
    soup = None
    response = util.get_request(node)
    if response is None:
        print('No response')
        neighbors = None
    else:
        text = util.read_request(response)
        if text == "":
            print("No text read")
            neighbors = None
        else:
            soup = bs4.BeautifulSoup(text, "html5lib")
            for link in soup.find_all("a"):
                url_raw = link.get("href")
                url_rel = util.remove_fragment(url_raw)
                url = util.convert_if_relative_url(node, url_rel)
                print(url)
                if url is not None:
                    neighbors.append(url)
    return neighbors, response, soup
Example no. 13
# note: the default visited_pages set below is created once at definition time
# and is shared across top-level calls (standard Python mutable-default behaviour)
def get_lyrics(cnx, cursor, url=STARTING_URL, visited_pages=set()):

	print("\ntrying", url)
	visited_pages.add(url)
	print("visited_pages length: {}".format(len(visited_pages)))

	try:
		url, soup = open_page(url)

	except Exception:
		print("there was an exception - you done f****d up\n")
		return None

	if not url:
		print("no url")
		return None

	## base case
	if url[-4:] == '.txt':# or len(visited_pages) > 2:
		if soup.find('pre'):
			text = soup.find('pre').text
			print("adding {}\n".format(url))
		else:
			ps = soup.find_all('p')
			ps = list(filter(lambda x: 'Artist:' in x.text, ps))
			if len(ps)==1:
				text = ps[0].text
			else:
				text = soup.text

		cursor.execute(ADD_RAW_TEXT, (url, text))
		cnx.commit()
		# return [(url, text)]

	## recursive case
	else:
		# print("reached recursive case")
		lyrics = []
		if soup.find('div', id='leftmain'):
			tag = soup.find('div', id='leftmain')
		else:
			tag = soup
		new_links = tag.find_all('a', href=True)

		for link in new_links:

			# if len(visited_pages) > 30:
			# 	continue

			### check if link can be followed
			# if 'href' not in link.attrs:
			# 	print("href not in attrs")
			# 	continue
			
			# print(link['href'])

			clean_link = util.remove_fragment(link['href'])
			# print("link: {}".format(clean_link))

			if not clean_link:
				print("no clean link")
				continue
			
			abs_link = util.convert_if_relative_url(url, clean_link)

			if abs_link in visited_pages or not util.is_url_ok_to_follow(abs_link, LIMITING_DOMAIN) or 'update' in abs_link:
				print("child link shan't be followed: {}".format(abs_link))
				continue

			get_lyrics(cnx, cursor, abs_link, visited_pages)
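get_lyrics expects a database connection, a cursor, and an ADD_RAW_TEXT insert statement defined elsewhere in the original project. The setup below is purely illustrative, using sqlite3 so it runs anywhere; the cnx name suggests the original used mysql.connector, whose placeholder style and schema may differ.

import sqlite3

# Assumed table and insert statement; names and the "?" placeholder style are
# illustrative, not taken from the original project.
ADD_RAW_TEXT = "INSERT INTO raw_lyrics (url, text) VALUES (?, ?)"

cnx = sqlite3.connect("lyrics.db")
cursor = cnx.cursor()
cursor.execute("CREATE TABLE IF NOT EXISTS raw_lyrics (url TEXT, text TEXT)")
cnx.commit()

get_lyrics(cnx, cursor)  # starts from STARTING_URL with an empty visited set
cnx.close()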