import re
import sys


def main():
    NUM_THREADS = 4  # currently unused

    if len(sys.argv) <= 2:
        print("usage is domain-pattern seed-url [max-num-pages-visited] ")
        print("          -w domain-pattern")
        print("           |  ")
        print("           ^  ")
        print("  Ex: nist.gov  http://math.nist.gov  100 ")
        print("  -w means to continue from a webcrawl dump (fed into stdin)")
        print(" ")
        sys.exit(2)

    links_to_visit = set()
    links_already_dispatched = set()
    hash_codes_already_visited = set()  # content hashes of pages already visited
    max_num_page_visited = 0  # if 0, then there is no limit

    if sys.argv[1] == "-w":
        # start from a previous crawl (webcrawl dump read from stdin)
        process_wg_file(sys.stdin, links_already_dispatched,
                        hash_codes_already_visited, links_to_visit)
        url_matching_pattern = sys.argv[2]
        # if resuming index creation, reload the existing index
        read_index_files()
    else:
        url_matching_pattern = sys.argv[1]
        starting_url = sys.argv[2]
        links_to_visit.add(starting_url)
        if len(sys.argv) > 3:
            max_num_page_visited = int(sys.argv[3])

    print("#!# domain pattern: ", url_matching_pattern)
    print(" ")

    # go crawl the web...
    num_page, num_edges = consumer(
        sys.stdout,
        url_matching_pattern,
        max_num_page_visited,
        links_to_visit,
        links_already_dispatched,
        hash_codes_already_visited,
    )

    # write results of index creation to file (DJS Oct 2015)
    write_index()

    print("\n[-- DONE --]\n", file=sys.stdout)
    print("read ", num_page, " pages.", file=sys.stdout)
    print("number of edges : ", num_edges, file=sys.stdout)
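# ---------------------------------------------------------------------------
# Note: producer(), process_webpage(), process_wg_file(), read_index_files()
# and write_index() are defined elsewhere in the crawler and are not shown in
# this listing.  The stub below is only an illustrative sketch of the contract
# consumer() relies on -- fetch one URL and return (timestamp, canonical_url,
# page_contents) -- using plain urllib as a stand-in for the real fetcher.
# ---------------------------------------------------------------------------
import time
import urllib.request


def producer(url, links_already_dispatched):
    """Sketch of a fetcher: returns (timestamp, canonical_url, page_contents)."""
    links_already_dispatched.add(url)  # never dispatch the same URL twice
    timestamp = time.time()
    with urllib.request.urlopen(url, timeout=10) as response:
        canonical_url = response.geturl()  # URL after following any redirects
        page_contents = response.read().decode("utf-8", errors="replace")
    return timestamp, canonical_url, page_contents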
def consumer(filestream, url_matching_pattern, max_num_page_visited,
             links_to_visit, links_already_dispatched,
             hash_codes_already_visited):
    num_edges = 0
    num_page = 0

    while (len(links_to_visit) > 0) and \
          ((max_num_page_visited < 1) or (num_page < max_num_page_visited)):

        # here is where we wait for the producer()
        url = links_to_visit.pop()
        timestamp, canonical_url, page_contents = producer(url, links_already_dispatched)

        # mark canonical links also as "seen"
        if url != canonical_url:
            links_already_dispatched.add(canonical_url)

        num_page += 1
        links_to_follow = process_webpage(num_page, timestamp, url, canonical_url,
                                          page_contents, links_already_dispatched)
        num_edges += len(links_to_follow)

        # queue only links that match the domain pattern (added DJS Oct 2015)
        for link in links_to_follow:
            if link.find(url_matching_pattern) == -1:
                continue
            if re.search(r'\.pdf', link):
                # kludge to skip PDFs (DJS Nov 2017)
                continue
            if link not in links_already_dispatched:
                links_to_visit.add(link)

        # periodically checkpoint the index
        if num_page % 100 == 0:
            print("Saving index")
            write_index()

    return num_page, num_edges
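# Entry-point guard with example invocations (the script and dump-file names
# are assumed, not part of the original listing):
#
#   python crawler.py nist.gov http://math.nist.gov 100
#   python crawler.py -w nist.gov < crawl_dump.txt
#
if __name__ == "__main__":
    main()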