Example #1
def main():

    NUM_THREADS = 4
    if len(sys.argv) <= 2:
        print("usage is domain-pattern seed-url  [max-num-pages-visited] ")
        print("     -w  domain-pattern")
        print("              | ")
        print("              ^ ")
        print(" Ex:  nist.gov http://math.nist.gov 100 ")
        print("    -w means to continue from a webcrawl dump  (fed into stdin)")
        print(" ")
        sys.exit(2)

    links_to_visit = set([])
    links_already_dispatched = set([])
    max_num_page_visited = 0  # if 0, then there is no limit

    if sys.argv[1] == "-w":  # start from a previous crawl
        process_wg_file(sys.stdin, links_already_dispatched, hash_codes_already_visited, links_to_visit)
        url_matching_pattern = sys.argv[2]
        ###### if resuming index creation, need to add call here ######
    else:
        url_matching_pattern = sys.argv[1]
        starting_url = sys.argv[2]
        links_to_visit.add(starting_url)
    if len(sys.argv) > 3:
        max_num_page_visited = int(sys.argv[3])

    print("#!#  domain pattern: ", url_matching_pattern)
    print(" ")

    # go crawl the web...
    #
    num_page, num_edges = consumer(
        sys.stdout,
        url_matching_pattern,
        max_num_page_visited,
        links_to_visit,
        links_already_dispatched,
        hash_codes_already_visited,
    )

    ############################################
    # 	add call here to write results of index creation to file DJS Oct 2015
    write_index()

    ############################################

    print("\n[-- DONE --]\n", file=sys.stdout)
    print("read ", num_page, " pages.", file=sys.stdout)
    print("number of edges : ", num_edges, file=sys.stdout)
Example #2
def main():

	NUM_THREADS = 4
	if (len(sys.argv) <= 2)  :
		print("usage is domain-pattern seed-url  [max-num-pages-visited] ")
		print("     -w  domain-pattern")
		print("              | ")
		print("              ^ ")
		print(" Ex:  nist.gov http://math.nist.gov 100 ")
		print("    -w means to continue from a webcrawl dump  (fed into stdin)")
		print(" ")
		sys.exit(2)

	links_to_visit = set([])
	links_already_dispatched = set([])
	max_num_page_visited = 0     #if 0, then there is no limit


	if (sys.argv[1] == "-w"):    # start from a previous crawl
		process_wg_file(sys.stdin, links_already_dispatched, \
			hash_codes_already_visited, links_to_visit )
		url_matching_pattern = sys.argv[2]
		###### if resuming index creation, need to add call here ######
	else:
		url_matching_pattern = sys.argv[1]
		starting_url = sys.argv[2]
		links_to_visit.add(starting_url)
	if (len(sys.argv) > 3):
		max_num_page_visited = int(sys.argv[3])
 
	print("#!#  domain pattern: ", url_matching_pattern)
	print(" ")


	# go crawl the web...
	#
	num_page, num_edges = \
	consumer( sys.stdout, url_matching_pattern, max_num_page_visited, \
		links_to_visit,  links_already_dispatched, hash_codes_already_visited)

############################################
#	add call here to write results of index creation to file DJS Oct 2015
	write_index()

############################################
  
	print("\n[-- DONE --]\n", file=sys.stdout)
	print("read ", num_page,  " pages.", file=sys.stdout)
	print("number of edges : ", num_edges, file=sys.stdout)
Example #3
def main():

    NUM_THREADS = 4

    if len(sys.argv) <= 2:
        print "usage is domain-pattern seed-url  [max-num-pages-visited] "
        print "     -w  domain-pattern"
        print "              | "
        print "              ^ "
        print " Ex:  nist.gov http://math.nist.gov 100 "
        print "    -w means to continue from a webcrawl dump  (fed into stdin)"
        print " "

        sys.exit(2)

    links_to_visit = set([])
    links_already_dispatched = set([])
    max_num_page_visited = 0  #if 0, then there is no limit

    if sys.argv[1] == "-w":  # start from a previous crawl
        process_wg_file(sys.stdin, links_already_dispatched, \
                       hash_codes_already_visited, links_to_visit )
        url_matching_pattern = sys.argv[2]
    else:
        url_matching_pattern = sys.argv[1]
        starting_url = sys.argv[2]
        links_to_visit.add(starting_url)
        if len(sys.argv) > 3:
            max_num_page_visited = int(sys.argv[3])

    print "#!#  domain pattern: ", url_matching_pattern
    print " "

    # go crawl the web...
    #
    num_page, num_edges = \
      consumer( sys.stdout, url_matching_pattern, max_num_page_visited, \
         links_to_visit,  links_already_dispatched, hash_codes_already_visited)

    print >> sys.stdout, "\n[-- DONE --]\n"
    print >> sys.stdout, "read ", num_page, " pages."
    print >> sys.stdout, "number of edges : ", num_edges
    print_frontier(sys.stdout, links_to_visit)
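
This variant also dumps the remaining frontier at the end via print_frontier, presumably so a later -w run can pick up where this one stopped. A minimal sketch of such a helper might look like the following; the name and output format are assumptions.

# Hypothetical sketch; the real print_frontier output format is not shown.
def print_frontier_sketch(out, links_to_visit):
    for url in sorted(links_to_visit):
        out.write(url + "\n")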
Example #4
def main():

    global url_matching_pattern
    global links_dispatched
    global hash_codes_visited

    timeout = 10  # 10-second timeout for fetching URLs
    socket.setdefaulttimeout(timeout)

    # NUM_THREADS = 128
    # NUM_THREADS = 32
    # NUM_THREADS = 8

    NUM_THREADS = 2

    # read wg files, figure out url_matching_pattern, and max_num_pages, if needed
    #
    if len(sys.argv) <= 2:
        print "usage is domain-pattern seed-url  [max-num-pages-visited] "
        print "     -w  domain-pattern"
        print "              | "
        print "              ^ "
        print " Ex:  nist.gov http://math.nist.gov 100 "
        print "    -w means to continue from a webcrawl dump  (fed into stdin)"
        print " "

        sys.exit(2)

    max_pages_to_visit = 0  #if 0, then there is no limit

    seed_urls = set([])

    if sys.argv[1] == "-w":  # start from a previous crawl
        readwg.process_wg_file(sys.stdin, links_dispatched, \
                       hash_codes_visited, seed_urls )
        url_matching_pattern = sys.argv[2]
        if len(sys.argv) > 3:
            max_pages_to_visit = int(sys.argv[3])
    else:
        url_matching_pattern = sys.argv[1]
        starting_url = sys.argv[2]
        seed_urls.add(starting_url)
        if len(sys.argv) > 3:
            max_pages_to_visit = int(sys.argv[3])

    print "#!#  domain pattern: ", url_matching_pattern
    print " "

    links_dispatched = seed_urls

    try:

        C = parallel_crawler(NUM_THREADS, process_webpage, seed_urls, \
          max_pages_to_visit)

    except (KeyboardInterrupt, SystemExit):

        #C.terminate()
        print "Ctrl-C hit: exiting... "
        print ""
        sys.exit(0)  # a bare "exit" would not actually terminate here

    print "[-- DONE --]"
    print "num_pages:", C.num_pages_crawled()
    print ""
    print "Active urls being fetched: "
    for a in C.active_urls_being_fetched():
        print a
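
The last example hands the crawl to a parallel_crawler class driven by a worker count, a per-page callback, the seed URLs, and a page limit, and afterwards only calls two accessors on the result. Its internals are not shown, so the class below is a simplified, hypothetical stand-in built on concurrent.futures that mirrors those two accessors; in particular, the assumption that process_webpage returns a list of newly discovered links is not confirmed by the snippet.

# Hypothetical stand-in for parallel_crawler, NOT the original class.
import threading
from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait


class ParallelCrawlerSketch:
    def __init__(self, num_threads, process_webpage, seed_urls, max_pages):
        self._lock = threading.Lock()
        self._active = set()          # URLs currently being fetched
        self._pages = 0               # pages processed so far
        frontier = list(seed_urls)
        dispatched = set(seed_urls)
        with ThreadPoolExecutor(max_workers=num_threads) as pool:
            futures = {}
            while frontier or futures:
                # Keep the pool busy while respecting the page limit.
                while (frontier and len(futures) < num_threads
                       and (max_pages == 0 or self._pages < max_pages)):
                    url = frontier.pop()
                    with self._lock:
                        self._active.add(url)
                    futures[pool.submit(process_webpage, url)] = url
                if not futures:
                    break
                done, _ = wait(futures, return_when=FIRST_COMPLETED)
                for fut in done:
                    url = futures.pop(fut)
                    with self._lock:
                        self._active.discard(url)
                        self._pages += 1
                    try:
                        # Assumed: the callback returns newly found links.
                        new_links = fut.result() or []
                    except Exception:
                        new_links = []
                    for link in new_links:
                        if link not in dispatched:
                            dispatched.add(link)
                            frontier.append(link)

    def num_pages_crawled(self):
        return self._pages

    def active_urls_being_fetched(self):
        with self._lock:
            return list(self._active)

A thread pool is used here purely for illustration; the commented-out NUM_THREADS values in the original suggest the real class manages its own worker threads, and its exact scheduling and shutdown behavior may differ.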