import hashlib                     # needed for hashlib.sha1 below
from datetime import datetime      # needed for datetime.now below

# Helpers such as url_errors, print_error_record, make_index,
# extract_all_href_links, decide_which_links_to_follow and the
# terminal_extensions setting come from elsewhere in the project and are
# not shown in this excerpt.


def process_webpage(num_page, timestamp, url, canonical_url, page_contents,
                    links_already_dispatched):
    global hash_codes_already_visited, filestream, url_matching_pattern

    # If retrieval was not successful (due to permission, password
    # protection), then print the url and its error code, and continue
    # with the next page.
    if page_contents in url_errors.URL_errors:
        print_error_record(filestream, num_page, timestamp, url,
                           canonical_url, page_contents)
        return []

    # Otherwise, we have a valid page.
    hash_code = hashlib.sha1(page_contents).hexdigest()   # uncommented DJS 30/9/15
    # hash_code = sha.new(page_contents).hexdigest()      # commented out DJS 30/9/15
    seq_timestamp = datetime.now().strftime("%Y-%m-%d:%H:%M:%S")   # not used further in this function
    # print_header_record(filestream, num_page, len(page_contents), timestamp, url, canonical_url)   # commented out DJS 30/9/15

    # Did we see this hash_code already (under a different url)?
    # If so, prefix it with '!' and skip printing the links.
    if hash_code in hash_codes_already_visited:
        print("!" + hash_code, file=filestream)
        print("", file=filestream)
        return []
    else:
        # print >> filestream, hash_code                  # commented out DJS 30/9/15
        # Print a line to show what is being retrieved - for demonstration only.
        # print(num_page, len(page_contents), url, page_contents)   # added DJS 30/9/15
        # print(num_page, len(page_contents), url)                  # added DJS 30/9/15
        ##################### add code here DJS 30/9/15 ##################
        make_index(url, page_contents)                    # added DJS Oct 2015
        ##################################################################
        page_links = extract_all_href_links(page_contents, canonical_url)
        follow_links = decide_which_links_to_follow(
            url_matching_pattern, terminal_extensions, canonical_url, url,
            page_links
        )
        # print_links(filestream, follow_links)           # commented out DJS Oct 15
        # print('')                                       # commented out DJS Oct 15
        hash_codes_already_visited.add(hash_code)
        return follow_links
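
# --- hedged sketch (not part of the original module) ----------------------
# A minimal, self-contained illustration of the SHA-1 de-duplication idea
# used in process_webpage above: hashing page contents catches the same
# document served under different urls. The urls and page bodies below are
# made up for illustration only; call _sha1_dedup_demo() to see the output.
def _sha1_dedup_demo():
    pages = [
        ("http://example.com/a", b"<html>same body</html>"),
        ("http://example.com/b", b"<html>same body</html>"),        # duplicate content
        ("http://example.com/c", b"<html>different body</html>"),
    ]
    seen_hashes = set()
    for url, contents in pages:
        digest = hashlib.sha1(contents).hexdigest()
        if digest in seen_hashes:
            print("!" + digest, url)   # duplicate page, mark with '!'
        else:
            seen_hashes.add(digest)
            print(digest, url)         # first time we see this content
# ---------------------------------------------------------------------------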
def make_index_and_urls(self):
    """
    Make an index and a url-map from scratch.
    """
    c = crawler.Crawler()
    c.crawl()
    self._urls = c.get_page_urls()
    preprocessor.clean_and_tokenize_all()
    self._index = indexer.make_index(tagged=False)
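
# --- hedged sketch (not part of the original module) ----------------------
# indexer.make_index and preprocessor.clean_and_tokenize_all are project
# modules that are not shown in this excerpt, so the shape of the result is
# only an assumption. As a rough, self-contained illustration, an untagged
# inverted index can map each token to the set of urls containing it:
from collections import defaultdict


def _toy_inverted_index(pages):
    """pages: dict mapping url -> list of already-tokenized terms."""
    index = defaultdict(set)
    for url, tokens in pages.items():
        for token in tokens:
            index[token].add(url)
    return index

# Example (hypothetical data):
#   _toy_inverted_index({"http://example.com": ["search", "engine"],
#                        "http://example.org": ["engine", "index"]})
#   -> {"search": {"http://example.com"},
#       "engine": {"http://example.com", "http://example.org"},
#       "index":  {"http://example.org"}}
# ---------------------------------------------------------------------------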