Beispiel #1
0
    def _a_star_search(self, heuristic_func):
        """Run A* from ``self.state`` toward ``Puzzle.goal_state``.

        The frontier stores ``(f, path)`` pairs where
        ``f = g(path) + heuristic(last_state)`` and every move costs 1.
        Stale frontier entries are handled via lazy deletion: an entry
        whose state was already expanded is discarded when popped.

        Returns ``(path, nodes_generated)`` on success, or
        ``(None, nodes_generated)`` if the frontier is exhausted.
        """
        # Trivial case: start state is already the goal.
        if self.state == Puzzle.goal_state:
            return [self.state], 0

        frontier = Frontier()
        frontier.push((heuristic_func(self.state), [self.state]))
        closed = []      # states already expanded (list preserves original semantics)
        generated = 0    # number of successor nodes pushed onto the frontier

        while not frontier.empty():
            # Pop the entry with the smallest f-cost.
            f_cost, path = frontier.pop()
            state = path[-1]

            # Lazy deletion: ignore entries for already-expanded states.
            if state in closed:
                continue

            # Goal test happens on expansion, not on generation.
            if state == Puzzle.goal_state:
                return path, generated

            closed.append(state)

            # Expand: generate every not-yet-expanded successor.
            for successor in Puzzle._get_neighbours(state):
                if successor in closed:
                    continue

                generated += 1

                # f(successor) = (f(state) - h(state)) + step_cost + h(successor);
                # subtracting h(state) recovers g(state) from the stored f-cost.
                frontier.push((f_cost - heuristic_func(state)
                               + 1 + heuristic_func(successor),
                               path + [successor]))

        # Frontier exhausted without reaching the goal.
        return None, generated
Beispiel #2
0
]

# Crawl setup: seed the URL frontier and create the page fetcher.
frontier = Frontier(SEEDS)
crawler = Crawler()

# Output path prefix for crawled pages; one file per 100 pages (suffix appended below).
FILE = "/Users/Sun/Documents/IR/Data/HW3/pages/page"
# Path used to periodically persist the frontier so a crawl can be resumed.
FRONTIER_BACKUP = "/Users/Sun/Documents/IR/Data/HW3/pages/frontier"
# frontier.restore(open(FRONTIER_BACKUP))

# Number of pages crawled so far and the target crawl size.
crawled = 0
MIN_CRAWL = 35000

# purl: previous URL placeholder — TODO(review): appears unused in the visible code; confirm.
purl = None
# Maps domain -> datetime of the last fetch, used for per-domain politeness delay.
DOMAIN_TIMESTAMP = {}

while not frontier.empty() and crawled < MIN_CRAWL:
    if crawled % 100 == 0:
        print str(crawled) + " pages crawled   " + str(
            crawled * 100 / MIN_CRAWL) + '%'
        ofile = open(FILE + str(crawled / 100), 'a')
        frontier.backup(open(FRONTIER_BACKUP, 'w'))

    url, inlinks = frontier.next_url()
    domain = Page.domain(url)
    now = datetime.datetime.now()
    print "Fetching " + url
    if domain in DOMAIN_TIMESTAMP:
        elasp = now - DOMAIN_TIMESTAMP[domain]
        ELASP_IN_SEC = elasp.total_seconds()
        if ELASP_IN_SEC < 1:
            sleep(1 - ELASP_IN_SEC)