Example #1
import sys

def main():

  NUM_THREADS = 4
  if (len(sys.argv) <= 2):
    print("usage is domain-pattern seed-url  [max-num-pages-visited] ")
    print("     -w  domain-pattern")
    print("              | ")
    print("              ^ ")
    print(" Ex:  nist.gov http://math.nist.gov 100 ")
    #print("    -w means to continue from a webcrawl dump  (fed into stdin)")
    print(" ")
    sys.exit(2)

  links_to_visit = set([])
  links_already_dispatched = set([])
  hash_codes_already_visited = set([])   # assumed: used below but not defined in this excerpt
  max_num_page_visited = 0     #if 0, then there is no limit


  if (sys.argv[1] == "-w"):    #start from a previous crawl
    print('loading file')
    #process_wg_file(sys.stdin, links_already_dispatched, hash_codes_already_visited, links_to_visit)
    url_matching_pattern = sys.argv[2]
    ###### if resuming index creation, need to add call here ######
    read_index_files()
    ######
  else:
    url_matching_pattern = sys.argv[1]
    starting_url = sys.argv[2]
    links_to_visit.add(starting_url)
  if (len(sys.argv) > 3):
    max_num_page_visited = int(sys.argv[3])

  print("#!#  domain pattern: ", url_matching_pattern)
  print(" ")


  # go crawl the web...
  #
  num_page, num_edges = \
  consumer( sys.stdout, url_matching_pattern, max_num_page_visited, \
    links_to_visit,  links_already_dispatched, hash_codes_already_visited)

############################################
#	add call here to write results of index creation to file DJS Oct 2015
  write_index()

############################################
  
  print("\n[-- DONE --]\n", file=sys.stdout)
  print("read ", num_page,  " pages.", file=sys.stdout)
  print("number of edges : ", num_edges, file=sys.stdout)
Example #2
import sys

def main():

    NUM_THREADS = 4
    if len(sys.argv) <= 2:
        print("usage is domain-pattern seed-url  [max-num-pages-visited] ")
        print("     -w  domain-pattern")
        print("              | ")
        print("              ^ ")
        print(" Ex:  nist.gov http://math.nist.gov 100 ")
        print("    -w means to continue from a webcrawl dump  (fed into stdin)")
        print(" ")
        sys.exit(2)

    links_to_visit = set([])
    links_already_dispatched = set([])
    hash_codes_already_visited = set([])  # assumed: used below but not defined in this excerpt
    max_num_page_visited = 0  # if 0, then there is no limit

    if sys.argv[1] == "-w":  # start from a previous crawl
        process_wg_file(sys.stdin, links_already_dispatched, hash_codes_already_visited, links_to_visit)
        url_matching_pattern = sys.argv[2]
        ###### if resuming index creation, need to add call here ######
    else:
        url_matching_pattern = sys.argv[1]
        starting_url = sys.argv[2]
        links_to_visit.add(starting_url)
    if len(sys.argv) > 3:
        max_num_page_visited = int(sys.argv[3])

    print("#!#  domain pattern: ", url_matching_pattern)
    print(" ")

    # go crawl the web...
    #
    num_page, num_edges = consumer(
        sys.stdout,
        url_matching_pattern,
        max_num_page_visited,
        links_to_visit,
        links_already_dispatched,
        hash_codes_already_visited,
    )

    ############################################
    # 	add call here to write results of index creation to file DJS Oct 2015
    write_index()

    ############################################

    print("\n[-- DONE --]\n", file=sys.stdout)
    print("read ", num_page, " pages.", file=sys.stdout)
    print("number of edges : ", num_edges, file=sys.stdout)
Example #3
import re

def consumer(filestream, url_matching_pattern, max_num_page_visited,
             links_to_visit, links_already_dispatched,
             hash_codes_already_visited):

  num_edges = 0
  num_page = 0

  
  while (len(links_to_visit) > 0) and \
    ((max_num_page_visited < 1) or (num_page < max_num_page_visited)):

    # here is where we wait for the producer()
    #
    url = links_to_visit.pop()
    timestamp,canonical_url,page_contents = producer(url, links_already_dispatched)
    # mark canonical links also as "seen" 
    #
    if (url != canonical_url) :
      links_already_dispatched.add(canonical_url)
    
    num_page += 1

    links_to_follow = process_webpage(num_page, timestamp, url, canonical_url, page_contents, links_already_dispatched)
  
    num_edges += len(links_to_follow)
    #print ("consumer: url_matching_pattern =", url_matching_pattern)
    
    ### start of added block ###
    for link in links_to_follow:
      if (link.find(url_matching_pattern) == -1): 
        continue
      if (re.search(r'\.pdf', link)):  # kludge to remove pdfs DJS Nov2017
        continue
      if (link not in links_already_dispatched):
        #print ("consumer:link =", link)
        links_to_visit.add(link)
      
    ### end of added block ###
    # original commented out DJS Oct 2015
    #for link in links_to_follow:
        #   if link not in links_already_dispatched:
        #         links_to_visit.add(link)
    if num_page % 100 == 0:
      print("Saving index")
      write_index()
  return num_page, num_edges
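consumer() relies on producer() and process_webpage(), which are not part of the excerpt. Their shapes can be read off the call sites: producer(url, links_already_dispatched) returns (timestamp, canonical_url, page_contents), and process_webpage(...) returns the outgoing links found on the page. A minimal stand-in sketch, assuming a plain urllib fetch and a naive href scan; the real crawler certainly does more (e.g. updating the index that write_index() saves):

import re
import time
import urllib.request

def producer(url, links_already_dispatched):
  """Fetch one page; return (timestamp, canonical_url, page_contents). Sketch only."""
  links_already_dispatched.add(url)          # mark the url as dispatched before fetching
  with urllib.request.urlopen(url) as response:
    canonical_url = response.geturl()        # URL after any redirects
    page_contents = response.read().decode("utf-8", errors="replace")
  return time.time(), canonical_url, page_contents

def process_webpage(num_page, timestamp, url, canonical_url, page_contents,
                    links_already_dispatched):
  """Return the absolute http(s) links found on the page. Sketch only."""
  return re.findall(r'href="(https?://[^"]+)"', page_contents)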