Example 1
def mini_crawler(url, page_parser_q, pull_info_q, links_visited, limiting_domain, index_list, parsing_default_domain):
    '''
    Crawls the college catalog and adds to an index dictionary that maps
    sets of words to the associated course identifiers.

    Inputs:
        url: starting url to begin crawling with
        page_parser_q, pull_info_q: queues of urls in line to be crawled
        links_visited: list of visited links
        limiting_domain: domain name
        index_list: dictionary that maps words to course identifiers
    '''
    if url in links_visited:
        return
    request = util.get_request(url)
    if request is None:
        return
    post_url = util.get_request_url(request)
    if post_url in links_visited:
        return
    html = util.read_request(request)
    soup = bs4.BeautifulSoup(html, features="html5lib")
    find_links(soup, url, post_url, pull_info_q, links_visited, limiting_domain)
    tag_list = soup.find_all("ul", attrs = {"class": "pagination"})
    current_page = tag_list[0].find_all("li", attrs = {"class": "current"})
    next_page = current_page[0].next_sibling.next_sibling.findChild()
    next_page_href = next_page.get('href')
    next_page_href = util.convert_if_relative_url(post_url, next_page_href)
    page_parser_q.put(next_page_href)
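The .next_sibling.next_sibling chain above can look arbitrary; with the html5lib parser, the whitespace between <li> tags is itself a sibling node, so the first hop lands on text and the second on the next <li>. A minimal, self-contained sketch of this (the HTML is invented for illustration and nothing here relies on the course's util module):

# Sketch only: inline HTML made up to mimic a pagination bar.
import bs4

html = """
<ul class="pagination">
  <li><a href="page1.html">1</a></li>
  <li class="current"><a href="page2.html">2</a></li>
  <li><a href="page3.html">3</a></li>
</ul>
"""

soup = bs4.BeautifulSoup(html, features="html5lib")
pagination = soup.find_all("ul", attrs={"class": "pagination"})[0]
current = pagination.find_all("li", attrs={"class": "current"})[0]
next_li = current.next_sibling.next_sibling   # first sibling is whitespace text
print(next_li.findChild().get('href'))        # prints: page3.html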
Example 2
def mini_crawler(url, page_parser_q, pull_info_q, links_visited,
                 limiting_domain, index_list):
    '''
    Crawls the college catalog and adds to an index list that maps sets of
    words to the associated course identifiers.

    Inputs:
        url: starting url to begin crawling with
        page_parser_q, pull_info_q: queues of urls in line to be crawled
        links_visited: list of visited links
        limiting_domain: domain name
        index_list: list of dictionaries for each webpage
    '''
    if url in links_visited:
        return
    request = util.get_request(url)
    if request is None:
        return
    post_url = util.get_request_url(request)
    if post_url in links_visited:
        return
    html = util.read_request(request)
    soup = bs4.BeautifulSoup(html, features="html5lib")
    find_links(soup, url, post_url, pull_info_q, links_visited,
               limiting_domain)
    tag_list = soup.find_all("ul", attrs={"class": "Pagination"})
    pages = tag_list[0].find_all("li")
    pages = pages[1:]
    for page in pages:
        page_parser_q.put(page.findChild().get('href'))
Example 3
def find_tag_list(url, soup_object, request_obj, limiting_d):
    '''
    Finds the list of hyperlink tags in the soup for the given URL and
    collects the list of URLs that are OK to follow.

    Inputs:
        url (string): URL to find the list of further URLs from
        soup_object (soup object): soup object obtained by running
        Beautiful Soup on the given URL
        request_obj (request object): request object obtained from the
        functions given in the PA for getting a request object from a URL
        limiting_d (string): limiting domain for the URLs

    Outputs:
        url_list (list): list containing all URLs that can be followed
        from the given URL.
    '''
    https_url = util.get_request_url(request_obj)
    ahref_tag_list = soup_object.find_all('a', href = True)

    url_list = []

    for tag in ahref_tag_list:
        this_url = tag['href']
        newest_url = util.convert_if_relative_url(https_url, this_url)
        if util.is_url_ok_to_follow(newest_url, limiting_d):
            url_list.append(newest_url)
    
    return url_list
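util.convert_if_relative_url is one of the helper functions supplied with the assignment; a rough stand-in built only from the standard library behaves roughly like the sketch below (the real helper may differ, e.g. by returning None for malformed URLs):

# Hypothetical stand-in for util.convert_if_relative_url, standard library only;
# the helper shipped with the assignment may behave differently.
from urllib.parse import urljoin, urlparse

def convert_if_relative_url(current_url, new_url):
    '''Return an absolute form of new_url, resolved against current_url.'''
    if urlparse(new_url).netloc:      # already absolute
        return new_url
    return urljoin(current_url, new_url)

print(convert_if_relative_url("https://example.edu/catalog/index.html",
                              "courses/math.html"))
# prints: https://example.edu/catalog/courses/math.html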
Example 4
def create_dictionary(num_pages_to_crawl, course_map_filename, starting_url,
                      limiting_domain):
    '''
    Creates the dictionary mapping course id numbers to the words in the
    course titles and descriptions.

    Inputs:
        num_pages_to_crawl: (int) The number of pages to process
                            during the crawl.
        course_map_filename: (string) The name of the JSON file that contains
                             the mapping of course codes to course ids.
        starting_url: (string) The url of the first page that the
                      crawler visits.
        limiting_domain: (string) The limiting domain of the url.

    Outputs:
        The dictionary mapping course id numbers to the words in the course 
        titles and descriptions.
    '''

    with open(course_map_filename) as json_file:
        coursemap = json.load(json_file)

    url_list = []
    url_queue = queue.Queue()
    num_pages = 0
    course_dict = {}
    process_dict = {}

    starting_url = clean_url(starting_url, limiting_domain, parent_url=None)

    if starting_url:
        url_queue.put(starting_url)

    while num_pages < num_pages_to_crawl and not url_queue.empty():
        num_pages += 1
        next_url = url_queue.get()
        if next_url and next_url not in url_list:
            request = util.get_request(next_url)
            if request:
                request_url = util.get_request_url(request)
                if request_url and request_url not in url_list:
                    url_list.append(next_url)
                    if request_url not in url_list:
                        url_list.append(request_url)
                    html_text = util.read_request(request)
                    soup = bs4.BeautifulSoup(html_text, "html5lib")
                    process_dict.update(
                        find_course_info(soup, coursemap, course_dict))
                    if process_dict:
                        course_dict.update(process_dict)
                    href_list = soup.find_all("a", href=True)
                    for h in href_list:
                        h_url = h['href']
                        h_url = clean_url(h_url, limiting_domain, request_url)
                        if h_url:
                            url_queue.put(h_url)

    return course_dict
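Stripped of the JSON loading and page parsing, the loop above is a breadth-first traversal bounded by num_pages_to_crawl: a FIFO queue of URLs to visit plus a record of pages already seen. A toy sketch of just that control flow, with an in-memory dict standing in for the network fetch and href extraction:

# Minimal BFS-crawl skeleton; link_graph is invented data replacing real pages.
import queue

link_graph = {
    "index": ["math", "cs"],
    "math": ["cs", "index"],
    "cs": ["stats"],
    "stats": [],
}

def crawl(start, max_pages):
    visited = set()
    frontier = queue.Queue()
    frontier.put(start)
    order = []
    while not frontier.empty() and len(visited) < max_pages:
        page = frontier.get()
        if page in visited:          # skip pages already processed
            continue
        visited.add(page)
        order.append(page)
        for neighbor in link_graph.get(page, []):
            if neighbor not in visited:
                frontier.put(neighbor)
    return order

print(crawl("index", max_pages=3))   # prints: ['index', 'math', 'cs']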
Example 5
def build_search_engine(starting_url, limiting_domain, max_num_pages_to_visit):
    urls = Queue.Queue()
    visited = []
    index = {}

    def search(word):
        rv = []
        matches = []
        words = re.findall(r"[a-zA-Z]\w*", word)
        if len(words) == 0:
            return []
        for url in index.keys():
            for title in index[url].keys():
                for word in words:
                    word = word.lower()
                    if word in title or word in index[url][title]:
                        matches.append((title, url))
        for pair in matches:
            if matches.count(pair) == len(words):
                if pair not in rv:
                    rv.append(pair)
        return rv

    if util.is_url_ok_to_follow(starting_url, limiting_domain):
        urls.put(starting_url)
        while not urls.empty() and len(visited) < max_num_pages_to_visit:
            top_queue = urls.get()
            if top_queue not in visited and util.is_url_ok_to_follow(
                    top_queue, limiting_domain):
                request = util.get_request(top_queue)
                if request is None:
                    visited.append(top_queue)
                    continue
                new_page = util.get_request_url(request)
                if new_page != top_queue:
                    if new_page not in visited:
                        visited.append(new_page)
                        top_queue = new_page
                data = bs4.BeautifulSoup(util.read_request(request))
                visited.append(top_queue)
                index = indexer(index, top_queue, data)
                for link in data.find_all('a'):
                    href = link.get('href')
                    if href is None:
                        continue
                    href = util.remove_fragment(href)
                    if not util.is_absolute_url(href):
                        href = util.convert_if_relative_url(top_queue, href)
                    urls.put(href)
    else:
        return None
    return search
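build_search_engine returns its inner search function as a closure over index, so callers can keep querying long after the crawl finishes. A compact sketch of the same pattern over a hand-written index (the data is purely illustrative); the all(...) check plays the role of the matches.count(pair) == len(words) filter above:

# Closure sketch: the returned search() keeps access to the index it closed over.
import re

def build_search_engine(index):
    def search(query):
        words = [w.lower() for w in re.findall(r"[a-zA-Z]\w*", query)]
        if not words:
            return []
        results = []
        for url, titles in index.items():
            for title, text in titles.items():
                # keep pages whose title or text mentions every query word
                if all(w in title or w in text for w in words):
                    results.append((title, url))
        return results
    return search

search = build_search_engine({
    "http://example.edu/econ": {"intro economics": "supply demand markets"},
    "http://example.edu/math": {"calculus": "limits derivatives integrals"},
})
print(search("economics markets"))
# prints: [('intro economics', 'http://example.edu/econ')]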
Example 6
def crawl(url, limit):
    '''
    Takes a starting url and subsequently visits the links found on that
    page and on the following visited ones, staying within a fixed
    limiting domain and stopping once the limit on visited sites is reached.

    Inputs:
        url - (string) starting url to crawl from
        limit - (int) total number of sites to be visited

    Outputs:
        visited_links - list of the links visited during the crawl
    '''
    q = queue.Queue()
    limiting_domain = "federalreserve.gov/monetarypolicy/"
    visited_links = [url]
    q.put(url)
    count = 0
    while not q.empty() and count <= limit:
        links, soup = calendar_scraper(q.get(), limiting_domain, visited_links)
        # extract_words(soup, d_words, course_map_filename)
        if len(links) == 0:
            continue
        else:
            for link in links:
                count += 1
                req = util.get_request(link)
                # skip links whose request fails
                if req is None:
                    continue
                link2 = util.get_request_url(req)
                if link2 in visited_links:
                    continue
                q.put(link2)
                visited_links.append(link2)
                visited_links.append(link)
    return visited_links
Example 7
def get_clean_urls(url, limiting_domain):
    '''
    Given an absolute url, build the soup for that webpage and return a list
    of all 'a'-tag urls in that soup that have been cleaned (absolute urls
    only) and are 'ok' to follow.

    Inputs:
        url - absolute url
        limiting_domain - domain name
    Outputs:
        list of absolute urls
    '''
    soup = get_soup_from_url(url)
    all_a_tags = soup.find_all("a")
    # get urls (i.e. if tag has 'href' attribute)
    clean_urls = []
    for tag in all_a_tags:
        if tag.has_attr('href'):
            absolute = util.convert_if_relative_url(url, tag['href'])
            if not util.is_url_ok_to_follow(absolute, limiting_domain):
                continue
            absolute = util.remove_fragment(absolute)
            # protocol field reversion
            temp_request = util.get_request(absolute)
            if temp_request is None:
                continue
            reverted_url = util.get_request_url(temp_request)
            # is url ok to follow based on specification in PA2
            if util.is_url_ok_to_follow(reverted_url, limiting_domain):
                clean_urls.append(reverted_url)
    # remove duplicates
    final_url_list = []
    for link in clean_urls:
        if link not in final_url_list:
            final_url_list.append(link)

    return final_url_list
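The final loop removes duplicates while preserving the order in which URLs were found; since Python 3.7 the same effect is available in one line, because dicts keep insertion order:

# Order-preserving de-duplication with dict.fromkeys (Python 3.7+).
clean_urls = [
    "https://example.edu/a",
    "https://example.edu/b",
    "https://example.edu/a",
]
final_url_list = list(dict.fromkeys(clean_urls))
print(final_url_list)   # prints: ['https://example.edu/a', 'https://example.edu/b']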
Example 8
def mini_crawler(url, page_parser_q, pull_info_q, links_visited, limiting_domain, index_list):
    '''
    Crawls the college catalog and adds to an index list that maps sets of
    words to the associated course identifiers.

    Inputs:
        url: starting url to begin crawling with
        page_parser_q, pull_info_q: queues of urls in line to be crawled
        links_visited: list of visited links
        limiting_domain: domain name
        index_list: list of dictionaries for individual programs
    '''
    if url in links_visited:
        return
    request = util.get_request(url)
    if request is None:
        return
    post_url = util.get_request_url(request)
    if post_url in links_visited:
        return
    html = util.read_request(request)
    soup = bs4.BeautifulSoup(html, features="html5lib")
    find_links(soup, url, post_url, page_parser_q, pull_info_q, links_visited, limiting_domain)
Example 9
def calendar_scraper(url, limiting_domain):
    '''
    Extracts links from a given url.

    Inputs:
        url - (string) url from which to extract links
        limiting_domain - (string) domain that links must match

    Outputs:
        cal, art - lists of non-repeated absolute links that are ok to
                   follow, collected from the page's panel sections
    '''
    # A. Extracting links
    req = util.get_request(url)
    url2 = util.get_request_url(req)
    soup = make_soup(req)

    cal = []
    art = []
    if soup:
        cal_list = soup.find_all("div", class_="panel panel_default")
        for d in cal_list:
            # look for anchors inside each panel; the div itself has no href
            for link in d.find_all("a", href=True):
                d_tr = util.remove_fragment(link.get("href"))
                d_abs = util.convert_if_relative_url(url2, d_tr)
                if util.is_url_ok_to_follow(d_abs, limiting_domain):
                    cal.append(d_abs)

        art_list = soup.find_all("div", class_="panel panel_default")
        for d in art_list:
            for link in d.find_all("a", href=True):
                d_tr = util.remove_fragment(link.get("href"))
                d_abs = util.convert_if_relative_url(url2, d_tr)
                if util.is_url_ok_to_follow(d_abs, limiting_domain):
                    art.append(d_abs)

    return cal, art
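The link extraction above scopes the search to particular panel divs and then looks for anchors inside each one. A self-contained sketch of that pattern (the HTML and class names are invented for illustration and do not come from the site the example targets):

# Sketch only: scoping the href search to anchors inside each matched <div>.
import bs4

html = """
<div class="panel panel_default"><a href="/calendar/2024.htm">2024</a></div>
<div class="panel panel_default"><a href="/calendar/2023.htm">2023</a></div>
<div class="sidebar"><a href="/other.htm">other</a></div>
"""

soup = bs4.BeautifulSoup(html, features="html5lib")
links = []
for panel in soup.find_all("div", class_="panel panel_default"):
    for a in panel.find_all("a", href=True):
        links.append(a["href"])
print(links)   # prints: ['/calendar/2024.htm', '/calendar/2023.htm']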