Example #1
def process_webpage(num_page, timestamp, url, canonical_url, page_contents,
                    links_already_dispatched):

    global hash_codes_already_visited, filestream, url_matching_pattern

    # if retrieval was not successful (due to permission, password protection)
    # then print the url and its error code, and continue with next page
    #
    if (page_contents in url_errors.URL_errors):
        print_error_record(filestream, num_page, timestamp, url, canonical_url,
                           page_contents)
        return []

    #otherwise, we have a valid page
    #
    hash_code = hashlib.sha1(
        page_contents).hexdigest()  #uncommented  DJS 30/9/15
    #hash_code = sha.new(page_contents).hexdigest()	#commented out DJS 30/9/15

    seq_timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S")

    #print_header_record(filestream, num_page, len(page_contents), timestamp, url, canonical_url ) #commented out DJS 30/9/15

    # did we see this hash_code already (under a different url)?
    # if so, then prefix with '!' and skip printing the links
    #
    if (hash_code in hash_codes_already_visited):
        print("!" + hash_code, file=filestream)
        print("", file=filestream)
        return []
    else:
        #print >> filestream, hash_code #commented out DJS 30/9/15

        #print line to show what's being retrieved - for demonstration only
        #print(num_page, len(page_contents), url, page_contents) #added DJS 30/9/15
        #print(num_page, len(page_contents), url) #added DJS 30/9/15

        ##################### add code here DJS 30/9/15 ##################

        make_index(url, page_contents)  #added DJS Oct 2015

    ##################################################################

    page_links = extract_all_href_links(page_contents, canonical_url)
    follow_links = decide_which_links_to_follow(url_matching_pattern,
                                                terminal_extensions,
                                                canonical_url, url, page_links)

    #print_links(filestream, follow_links) # commented out DJS Oct 15
    #print('') # commented out DJS Oct 15

    hash_codes_already_visited.add(hash_code)

    return follow_links
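
A minimal sketch of how process_webpage might be driven from a breadth-first crawl loop. The retrieve_page helper, the queue handling, and max_pages are illustrative assumptions, not part of the example; only the call signature of process_webpage comes from the code above, and the surrounding module must still provide hashlib, url_errors, filestream, url_matching_pattern, terminal_extensions, and the other helpers it uses.

from collections import deque
from datetime import datetime

hash_codes_already_visited = set()   # shared with process_webpage via its global statement

def crawl(seed_url, max_pages=100):
    # Breadth-first crawl: fetch each page, hand it to process_webpage,
    # then enqueue whatever links it decides should be followed.
    queue = deque([seed_url])
    links_already_dispatched = {seed_url}
    num_page = 0
    while queue and num_page < max_pages:
        url = queue.popleft()
        num_page += 1
        timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S")
        canonical_url, page_contents = retrieve_page(url)  # hypothetical fetch helper
        for link in process_webpage(num_page, timestamp, url, canonical_url,
                                    page_contents, links_already_dispatched):
            if link not in links_already_dispatched:
                links_already_dispatched.add(link)
                queue.append(link)
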
Example #2
def process_webpage(num_page, timestamp, url, canonical_url, page_contents, links_already_dispatched):

    global hash_codes_already_visited, filestream, url_matching_pattern

    # if retrieval was not successful (due to permission, password protection)
    # then print the url and its error code, and continue with next page
    #
    if page_contents in url_errors.URL_errors:
        print_error_record(filestream, num_page, timestamp, url, canonical_url, page_contents)
        return []

    # otherwise, we have a valid page
    #
    hash_code = hashlib.sha1(page_contents).hexdigest()  # uncommented  DJS 30/9/15
    # hash_code = sha.new(page_contents).hexdigest()	#commented out DJS 30/9/15

    seq_timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S")

    # print_header_record(filestream, num_page, len(page_contents), timestamp, url, canonical_url ) #commented out DJS 30/9/15

    # did we see this hash_code already (under a different url)?
    # if so, then prefix with '!' and skip printing the links
    #
    if hash_code in hash_codes_already_visited:
        print("!" + hash_code, file=filestream)
        print("", file=filestream)
        return []
    else:
        # print >> filestream, hash_code #commented out DJS 30/9/15

        # print line to show what's being retrieved - for demonstration only
        # print(num_page, len(page_contents), url, page_contents) #added DJS 30/9/15
        # print(num_page, len(page_contents), url) #added DJS 30/9/15

        ##################### add code here DJS 30/9/15 ##################

        make_index(url, page_contents)  # added DJS Oct 2015

    ##################################################################

    page_links = extract_all_href_links(page_contents, canonical_url)
    follow_links = decide_which_links_to_follow(
        url_matching_pattern, terminal_extensions, canonical_url, url, page_links
    )

    # print_links(filestream, follow_links) # commented out DJS Oct 15
    # print('') # commented out DJS Oct 15

    hash_codes_already_visited.add(hash_code)

    return follow_links
def make_index_and_urls(self):
    """
    make_index_and_urls(self): Make an index and a url-map from scratch.
    """
    c = crawler.Crawler()
    c.crawl()
    self._urls = c.get_page_urls()
    preprocessor.clean_and_tokenize_all()
    self._index = indexer.make_index(tagged=False)
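
A hedged sketch of where a method like make_index_and_urls might live: a small wrapper class that owns the url map and the index. The class name, constructor, and the usage lines are assumptions for illustration; only the method body and the crawler, preprocessor, and indexer modules are taken from the snippet above.

import crawler
import indexer
import preprocessor

class SearchIndexBuilder:
    # Hypothetical wrapper class; only make_index_and_urls mirrors the snippet above.
    def __init__(self):
        self._urls = []
        self._index = None

    def make_index_and_urls(self):
        """Make an index and a url-map from scratch."""
        c = crawler.Crawler()
        c.crawl()                       # fetch and store the pages
        self._urls = c.get_page_urls()  # remember which url maps to which page
        preprocessor.clean_and_tokenize_all()
        self._index = indexer.make_index(tagged=False)

# Usage: build everything from scratch, then inspect what was crawled.
builder = SearchIndexBuilder()
builder.make_index_and_urls()
print(len(builder._urls), "pages crawled and indexed")
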