def _a_star_search(self, heuristic_func):
    """Run A* search from ``self.state`` toward ``Puzzle.goal_state``.

    The frontier is a min-heap of ``(path_total_cost, path)`` tuples where
    ``path_total_cost`` is the path cost so far plus the heuristic estimate
    of the path's last state. Duplicate frontier entries for a state are
    tolerated and discarded on pop (lazy deletion).

    Args:
        heuristic_func: callable mapping a puzzle state to a numeric
            cost-to-goal estimate.

    Returns:
        ``(path, node_gen)`` — the list of states from start to goal and the
        number of nodes generated — or ``(None, node_gen)`` if the frontier
        is exhausted without reaching the goal.
    """
    if self.state == Puzzle.goal_state:
        return [self.state], 0

    # heap of tuple(path_total_cost, path)
    # path_total_cost = path_cost + heuristic
    frontier = Frontier()
    frontier.push((heuristic_func(self.state), [self.state]))

    # Use a set for O(1) membership tests: the original list made every
    # "already explored?" check O(n), turning the search O(n^2) overall.
    # NOTE(review): assumes puzzle states are hashable (e.g. tuples) —
    # confirm against Puzzle's state representation.
    explored = set()
    node_gen = 0

    while not frontier.empty():
        # pop state with min estimated total cost from the frontier
        cur_path_total_cost, cur_path = frontier.pop()
        cur_state = cur_path[-1]

        # check lazy delete: skip stale entries for already-expanded states
        if cur_state in explored:
            continue

        # test goal condition (on expansion, not generation)
        if cur_state == Puzzle.goal_state:
            return cur_path, node_gen

        # add current state to explored set
        explored.add(cur_state)

        # get all neighbours of current state in asc order
        neighbours = Puzzle._get_neighbours(cur_state)
        for neighbour in neighbours:
            if neighbour in explored:
                continue
            # new path to go to a neighbour
            path_to_neighbour = cur_path.copy()
            path_to_neighbour.append(neighbour)
            # f(neighbour) = g(cur) + 1 + h(neighbour); recover g(cur) by
            # subtracting h(cur) from the popped total cost
            path_to_neighbour_total_cost = cur_path_total_cost - heuristic_func(cur_state) \
                + 1 + heuristic_func(neighbour)
            node_gen += 1
            # if neighbour already in frontier or not -> use lazy delete
            frontier.push(
                (path_to_neighbour_total_cost, path_to_neighbour))

    return None, node_gen
] frontier = Frontier(SEEDS) crawler = Crawler() FILE = "/Users/Sun/Documents/IR/Data/HW3/pages/page" FRONTIER_BACKUP = "/Users/Sun/Documents/IR/Data/HW3/pages/frontier" # frontier.restore(open(FRONTIER_BACKUP)) crawled = 0 MIN_CRAWL = 35000 purl = None DOMAIN_TIMESTAMP = {} while not frontier.empty() and crawled < MIN_CRAWL: if crawled % 100 == 0: print str(crawled) + " pages crawled " + str( crawled * 100 / MIN_CRAWL) + '%' ofile = open(FILE + str(crawled / 100), 'a') frontier.backup(open(FRONTIER_BACKUP, 'w')) url, inlinks = frontier.next_url() domain = Page.domain(url) now = datetime.datetime.now() print "Fetching " + url if domain in DOMAIN_TIMESTAMP: elasp = now - DOMAIN_TIMESTAMP[domain] ELASP_IN_SEC = elasp.total_seconds() if ELASP_IN_SEC < 1: sleep(1 - ELASP_IN_SEC)