def main():
    NUM_THREADS = 4
    if len(sys.argv) <= 2:
        print("usage is domain-pattern seed-url [max-num-pages-visited] ")
        print("          -w domain-pattern")
        print("           | ")
        print("           ^ ")
        print(" Ex:  nist.gov http://math.nist.gov 100 ")
        print(" -w means to continue from a webcrawl dump (fed into stdin)")
        print(" ")
        sys.exit(2)

    links_to_visit = set([])
    links_already_dispatched = set([])
    max_num_page_visited = 0  # if 0, then there is no limit

    if sys.argv[1] == "-w":  # start from a previous crawl
        process_wg_file(
            sys.stdin,
            links_already_dispatched,
            hash_codes_already_visited,
            links_to_visit,
        )
        url_matching_pattern = sys.argv[2]
        ###### if resuming index creation, need to add call here ######
    else:
        url_matching_pattern = sys.argv[1]
        starting_url = sys.argv[2]
        links_to_visit.add(starting_url)

    if len(sys.argv) > 3:
        max_num_page_visited = int(sys.argv[3])

    print("#!# domain pattern: ", url_matching_pattern)
    print(" ")

    # go crawl the web...
    #
    num_page, num_edges = consumer(
        sys.stdout,
        url_matching_pattern,
        max_num_page_visited,
        links_to_visit,
        links_already_dispatched,
        hash_codes_already_visited,
    )

    ############################################
    # add call here to write results of index creation to file  DJS Oct 2015
    write_index()
    ############################################

    print("\n[-- DONE --]\n", file=sys.stdout)
    print("read ", num_page, " pages.", file=sys.stdout)
    print("number of edges : ", num_edges, file=sys.stdout)
def main():
    NUM_THREADS = 4
    if (len(sys.argv) <= 2):
        print("usage is domain-pattern seed-url [max-num-pages-visited] ")
        print("          -w domain-pattern")
        print("           | ")
        print("           ^ ")
        print(" Ex:  nist.gov http://math.nist.gov 100 ")
        print(" -w means to continue from a webcrawl dump (fed into stdin)")
        print(" ")
        sys.exit(2)

    links_to_visit = set([])
    links_already_dispatched = set([])
    max_num_page_visited = 0   # if 0, then there is no limit

    if (sys.argv[1] == "-w"):   # start from a previous crawl
        process_wg_file(sys.stdin, links_already_dispatched, \
                        hash_codes_already_visited, links_to_visit)
        url_matching_pattern = sys.argv[2]
        ###### if resuming index creation, need to add call here ######
    else:
        url_matching_pattern = sys.argv[1]
        starting_url = sys.argv[2]
        links_to_visit.add(starting_url)

    if (len(sys.argv) > 3):
        max_num_page_visited = int(sys.argv[3])

    print("#!# domain pattern: ", url_matching_pattern)
    print(" ")

    # go crawl the web...
    #
    num_page, num_edges = \
        consumer(sys.stdout, url_matching_pattern, max_num_page_visited, \
                 links_to_visit, links_already_dispatched, hash_codes_already_visited)

    ############################################
    # add call here to write results of index creation to file  DJS Oct 2015
    write_index()
    ############################################

    print("\n[-- DONE --]\n", file=sys.stdout)
    print("read ", num_page, " pages.", file=sys.stdout)
    print("number of edges : ", num_edges, file=sys.stdout)
def main():
    NUM_THREADS = 4
    if len(sys.argv) <= 2:
        print "usage is domain-pattern seed-url [max-num-pages-visited] "
        print "          -w domain-pattern"
        print "           | "
        print "           ^ "
        print " Ex:  nist.gov http://math.nist.gov 100 "
        print " -w means to continue from a webcrawl dump (fed into stdin)"
        print " "
        sys.exit(2)

    links_to_visit = set([])
    links_already_dispatched = set([])
    max_num_page_visited = 0   # if 0, then there is no limit

    if sys.argv[1] == "-w":   # start from a previous crawl
        process_wg_file(sys.stdin, links_already_dispatched, \
                        hash_codes_already_visited, links_to_visit)
        url_matching_pattern = sys.argv[2]
    else:
        url_matching_pattern = sys.argv[1]
        starting_url = sys.argv[2]
        links_to_visit.add(starting_url)

    if len(sys.argv) > 3:
        max_num_page_visited = int(sys.argv[3])

    print "#!# domain pattern: ", url_matching_pattern
    print " "

    # go crawl the web...
    #
    num_page, num_edges = \
        consumer(sys.stdout, url_matching_pattern, max_num_page_visited, \
                 links_to_visit, links_already_dispatched, hash_codes_already_visited)

    print >> sys.stdout, "\n[-- DONE --]\n"
    print >> sys.stdout, "read ", num_page, " pages."
    print >> sys.stdout, "number of edges : ", num_edges

    print_frontier(sys.stdout, links_to_visit)
def main():
    global url_matching_pattern
    global links_dispatched
    global hash_codes_visited

    timeout = 10            # 10-second timeout for fetching URLs
    socket.setdefaulttimeout(timeout)

    # NUM_THREADS = 128
    # NUM_THREADS = 32
    # NUM_THREADS = 8
    NUM_THREADS = 2

    # read wg files, figure out url_matching_pattern, and max_num_pages, if needed
    #
    if len(sys.argv) <= 2:
        print "usage is domain-pattern seed-url [max-num-pages-visited] "
        print "          -w domain-pattern"
        print "           | "
        print "           ^ "
        print " Ex:  nist.gov http://math.nist.gov 100 "
        print " -w means to continue from a webcrawl dump (fed into stdin)"
        print " "
        sys.exit(2)

    max_pages_to_visit = 0   # if 0, then there is no limit
    seed_urls = set([])

    if sys.argv[1] == "-w":   # start from a previous crawl
        readwg.process_wg_file(sys.stdin, links_dispatched, \
                               hash_codes_visited, seed_urls)
        url_matching_pattern = sys.argv[2]
        if len(sys.argv) > 3:
            max_pages_to_visit = int(sys.argv[3])
    else:
        url_matching_pattern = sys.argv[1]
        starting_url = sys.argv[2]
        seed_urls.add(starting_url)
        if len(sys.argv) > 3:
            max_pages_to_visit = int(sys.argv[3])

    print "#!# domain pattern: ", url_matching_pattern
    print " "

    links_dispatched = seed_urls

    try:
        C = parallel_crawler(NUM_THREADS, process_webpage, seed_urls, \
                             max_pages_to_visit)
    except (KeyboardInterrupt, SystemExit):
        # C.terminate()
        print "Ctrl-C hit: exiting... "
        print ""
        sys.exit(1)   # was a bare "exit" (a no-op); exit explicitly instead

    print "[-- DONE --]"
    print "num_pages:", C.num_pages_crawled()
    print ""

    print "Active urls being fetched: "
    for a in C.active_urls_being_fetched():
        print a
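# ---------------------------------------------------------------------------
# Minimal entry-point sketch (an addition, not part of the original source).
# The script name "crawler.py" and the ".wg" dump extension below are
# illustrative assumptions; the invocation forms follow the usage message
# printed by main() above.
#
#   fresh crawl:    python crawler.py nist.gov http://math.nist.gov 100
#   resume a crawl: python crawler.py -w nist.gov 100 < previous_crawl.wg
#                   (the webcrawl dump is fed in on stdin, per the -w option)
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    main()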