Example #1
    def pick_clusters(self, clusters, nonleaves):
        """
    """
        _logger.debug("compute initial cluster errors. %d clusters",
                      len(clusters))
        start = time.time()
        for c in clusters:
            c.error = self.influence_cluster(c)
            c.c_range = list(self.c_range)
            c.inf_func = self.create_inf_func(c)
        self.stats['init_cluster_errors'] = [time.time() - start, 1]

        self.update_status("computing frontier")
        _logger.debug("compute initial frontier")
        frontier, _ = Frontier(self.c_range, 0.001)(clusters)

        ret = list(frontier)
        _logger.debug("get nonleaves containing frontier")
        for nonleaf in nonleaves:
            for c in frontier:
                if nonleaf.contains(c):
                    nonleaf.error = self.influence_cluster(nonleaf)
                    ret.append(nonleaf)
                    break

        self.update_status("expanding frontier (%d rules)" % len(ret))
        _logger.debug("second merger pass")
        return ret
Example #2
def main():
    from frontier import Frontier
    from options import parse_command_line
    # Master and tornado's IOLoop are assumed to be imported at module level.

    parse_command_line()

    ft = Frontier([
        ('http://localhost/', 1),
    ])
    Master(ft).start()
    IOLoop.instance().start()
Example #3
def greedy(matrix, start, goal):
    """
    Find the path from start to the goal using Greedy Best-first Search Algorithm
    The algorithm is implemented based on the description on Wikipedia:
    https://en.wikipedia.org/wiki/Best-first_search#Greedy_BFS
    Notice: GBFS is suboptimal algorithm, so the solution MAY NOT BE OPTIMAL!
    :param matrix: Search space, as a 2D list
    :param start: Start point, as a tuple
    :param goal: Goal point, as a tuple
    :return: The path (if found) from start to goal, or None
    """
    print('Analytics: start node ' + str(start) + ', goal node ' + str(goal))

    # The set of nodes already evaluated
    visited = set()
    partially_expanded = set()

    # The set of currently discovered nodes that are not evaluated yet.
    # Initially, only the start node is known.
    # frontier is implemented as a priority queue
    frontier = Frontier()
    frontier.add(start, manhattan_dist(start, goal))

    # For each node, which node it can most efficiently be reached from.
    # If a node can be reached from many nodes, came_from will eventually contain the
    # most efficient previous step.
    came_from = {}

    while frontier:
        current, current_distance = frontier.nearest
        if current == goal:
            print('Analytics: ' + str(len(partially_expanded)) +
                  ' expanded nodes out of ' + str(count_nodes(matrix)) +
                  ' nodes, among which ' + str(len(visited)) +
                  ' are fully expanded (all successors evaluated)')
            return reconstruct_path(came_from, current)

        partially_expanded.add(current)
        is_interrupted = False
        for neighbor in expand(current, matrix):
            if neighbor not in visited:
                neighbor_distance = manhattan_dist(neighbor, goal)
                if neighbor not in frontier:  # Discover a new node
                    came_from[neighbor] = current
                    frontier.add(neighbor, neighbor_distance)
                    if current_distance > neighbor_distance:
                        # The newly discovered neighbor looks closer than
                        # current: jump to it and leave current partially
                        # expanded on the frontier.
                        is_interrupted = True
                        break

        if not is_interrupted:
            frontier.pop_nearest()
            visited.add(current)

    return None
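
The Frontier class itself is not shown in these snippets. A minimal sketch of the priority-queue API that Examples #3, #11, and #14 rely on (add, nearest, pop_nearest, plus membership and truthiness tests) could look like the following; the method set is taken from the usage above, everything else is an assumption:

import heapq

class Frontier:
    """Minimal priority queue keyed by estimated distance (a sketch)."""

    def __init__(self):
        self._heap = []        # (priority, node) pairs
        self._members = set()  # nodes currently on the frontier

    def add(self, node, priority):
        heapq.heappush(self._heap, (priority, node))
        self._members.add(node)

    @property
    def nearest(self):
        # Peek at the node with the smallest estimated distance.
        priority, node = self._heap[0]
        return node, priority

    def pop_nearest(self):
        priority, node = heapq.heappop(self._heap)
        self._members.discard(node)
        return node, priority

    def __contains__(self, node):
        return node in self._members

    def __bool__(self):
        return bool(self._heap)

A production version would also handle re-adding a node with a better priority (decrease-key), which this sketch ignores.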
Example #4
    def __init__(self):
        self.url_waiting_queue = mp.Queue()
        self.url_result_queue = mp.Queue()

        self.config = Config()
        self.frontier = Frontier(self.config)
        self.agents = []
        for id in range(self.config.agentCount):
            self.agents.append(Agent(self.config, id))

        # seed the waiting queue with the configured seed URL
        self.url_waiting_queue.put(self.config.seedURL)
Example #5
    def _a_star_search(self, heuristic_func):
        if self.state == Puzzle.goal_state:
            return [self.state], 0

        # heap of tuple(path_total_cost, path)
        # path_total_cost = path_cost + heuristic
        frontier = Frontier()
        frontier.push((heuristic_func(self.state), [self.state]))
        # states already expanded; a set would give O(1) membership tests
        explored_list = []
        node_gen = 0

        while True:
            if frontier.empty():
                break

            # pop state with min cost from the frontier
            cur_path_total_cost, cur_path = frontier.pop()
            cur_state = cur_path[-1]

            # check lazy delete
            if cur_state in explored_list:
                continue

            # test goal condition
            if cur_state == Puzzle.goal_state:
                return cur_path, node_gen

            # add current state to explored list
            explored_list.append(cur_state)

            # get all neighbours of current state in asc order
            neighbours = Puzzle._get_neighbours(cur_state)

            for neighbour in neighbours:
                if neighbour not in explored_list:
                    # new path to go to a neighbour
                    path_to_neighbour = cur_path.copy()
                    path_to_neighbour.append(neighbour)

                    # calc path_total_cost (f = g + h), reusing the parent's f:
                    # f(neighbour) = g(cur) + 1 + h(neighbour)
                    #              = f(cur) - h(cur) + 1 + h(neighbour)
                    path_to_neighbour_total_cost = cur_path_total_cost - heuristic_func(cur_state) \
                                                    + 1 + heuristic_func(neighbour)

                    node_gen += 1

                    # if neighbour already in frontier or not
                    # -> use lazy delete
                    frontier.push(
                        (path_to_neighbour_total_cost, path_to_neighbour))

        return None, node_gen
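
A hypothetical call of the method above, assuming a Puzzle instance and a heuristic that maps a state to an estimated remaining cost (misplaced_tiles is an illustrative name, not part of the source):

# Hypothetical usage; Puzzle(initial_state) and misplaced_tiles are assumptions.
puzzle = Puzzle(initial_state)
path, node_gen = puzzle._a_star_search(misplaced_tiles)
if path is not None:
    print(len(path) - 1, 'moves,', node_gen, 'nodes generated')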
Example #6
    def __init__(self):
        self.seed_urls = None
        self.frontier = Frontier()
        self.canonicalizer = Canonicalizer()
        self.all_links = None
        self.crawled_links = set()
        self.count = 0
        self.all_out_links = {}
        self.redirected_map = {}
        self.robots = {}
        self.robots_delay = {}
        self.robots_timer = {}
        self.time_out = 3
        self.total_count = 40000
Example #7
    def __init__(self, num_threads, seeds, cont_to_crawl):
        self.num_workers = num_threads
        self.dash = Dashboard(num_threads)
        self.workers = []
        self.frontier = Frontier(num_threads, self.dash)
        self.db = Storage()

        # Create the workers
        for i in range(num_threads):
            self.workers.append(CrawlerThread(i, 'CrawlerThread' + str(i), self.frontier, self.dash))
        # print("Workers created")
        # insert seeds into the frontier's to-serve queue
        if not cont_to_crawl:
            self.frontier.push_to_serve(seeds, 0)
            # print("seeds pushed")
            self.frontier.distribute()
            # print("seeds distributed")
        else:
            self.frontier.load_to_crawl()
Example #8
def reweightbasegrammar(basegrammar,
                        pseudoCounts,
                        filter_depth=None,
                        size=None):
    frontiers = []
    for datum in islice(
            batchloader('train',
                        batchsize=1,
                        compute_sketches=False,
                        filter_depth=filter_depth), size):  # TODO
        # Task signature, for reference:
        #   Task(name, request, examples, features=None, cache=False)
        frontiers.append(
            Frontier([
                FrontierEntry(datum.p,
                              logPrior=basegrammar.logLikelihood(
                                  datum.tp, datum.p),
                              logLikelihood=0)
            ], Task('dummyName', datum.tp, [])))

    return basegrammar.insideOutside(frontiers, pseudoCounts, iterations=1)
Example #9
def main():

    corpus = Corpus(
        output="stack_without_dupes/result-{}.csv".format(CORPUS_SIZE))
    frontier = Frontier(corpus,
                        10,
                        8,
                        duplicate_identification=True,
                        verbose=VERBOSE,
                        debug=DEBUG)

    crawler = Crawler(SEEDS,
                      corpus,
                      frontier,
                      corpuse_max_size=CORPUS_SIZE,
                      duplicate_identification=True,
                      verbose=VERBOSE,
                      debug=DEBUG)
    print("Starting at {}".format(datetime.now()))
    crawler.start()
    print("Done at {}".format(datetime.now()))
Example #10
    def _pyccg_meanings_to_ec_frontiers(self, pyccg_meanings):
        """
        Ret:
            pyccg_frontiers: dict from task -> Dreamcoder frontiers.
        """
        pyccg_frontiers = {}
        for task in pyccg_meanings:
            if len(pyccg_meanings[task]) > 0:
                frontier_entries = []
                for (meaning, log_prob) in pyccg_meanings[task]:
                    ec_sexpr = self.pyccg_learner.ontology.as_ec_sexpr(meaning)
                    if self.ec_ontology_translation_fn:
                        ec_sexpr = self.ec_ontology_translation_fn(ec_sexpr, is_pyccg_to_ec=True)

                    # Uses the p=1.0 likelihood for programs that solve the task.
                    frontier_entry = FrontierEntry(
                        program=Program.parse(ec_sexpr),
                        logPrior=log_prob, 
                        logLikelihood=0.0)
                    frontier_entries.append(frontier_entry)

                pyccg_frontiers[task] = Frontier(frontier_entries, task)
        return pyccg_frontiers
Example #11
def a_star(matrix, start, goal, estimate=manhattan_dist):
    """
    Find the path from start to the goal using Greedy Best-first Search Algorithm
    The algorithm is implemented based on the description on Wikipedia:
    https://en.wikipedia.org/wiki/Best-first_search#Greedy_BFS
    Notice: GBFS is suboptimal algorithm, so the solution MAY NOT BE OPTIMAL!
    :param estimate: Heuristics used in a_star search
    :param matrix: Search space, as a 2D list
    :param start: Start point, as a tuple
    :param goal: Goal point, as a tuple
    :return: The path (if found) from start to goal, or None
    """
    print('Analytics: start node ' + str(start) + ', goal node ' + str(goal))

    # The set of nodes already evaluated
    visited = set()

    # For each node, the cost of getting from the start node to that node.
    # The cost of going from start to start is zero.
    g_score = {start: 0}

    # For each node, the total cost of getting from the start node to the goal
    # by passing by that node. That value is partly known, partly heuristic.
    # For the first node, that value is completely heuristic.
    f_score = {start: estimate(start, goal)}

    # The set of currently discovered nodes that are not evaluated yet.
    # Initially, only the start node is known.
    # frontier is implemented as a priority queue
    frontier = Frontier()
    frontier.add(start, f_score[start])

    # For each node, which node it can most efficiently be reached from.
    # If a node can be reached from many nodes, came_from will eventually contain the
    # most efficient previous step.
    came_from = {}

    while frontier:
        current, current_f_score = frontier.pop_nearest()
        if current == goal:
            print('Analytics: ' + str(len(visited)) +
                  ' expanded nodes, out of ' + str(count_nodes(matrix)) +
                  ' nodes')
            # draw_expanded_nodes(matrix, visited)
            return reconstruct_path(came_from, current)

        visited.add(current)
        for neighbor in expand(current, matrix):
            if neighbor not in visited:
                g_through_current = g_score[
                    current] + 1  # every neighbor has distance 1

                if (neighbor not in frontier
                        or g_through_current < g_score[neighbor]):
                    # Discover a new node or a better path
                    came_from[neighbor] = current
                    g_score[neighbor] = g_through_current
                    f_score[neighbor] = (g_score[neighbor] +
                                         estimate(neighbor, goal))
                    frontier.add(neighbor, f_score[neighbor])

    return None
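
A hypothetical invocation of a_star on a small grid; the 0 = walkable / 1 = wall cell convention and the exact path below are assumptions about the unshown expand helper:

# Hypothetical usage; the cell convention is an assumption.
maze = [
    [0, 0, 0, 1],
    [1, 1, 0, 1],
    [0, 0, 0, 0],
]
path = a_star(maze, (0, 0), (2, 3))
print(path)  # e.g. [(0, 0), (0, 1), (0, 2), (1, 2), (2, 2), (2, 3)]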
Example #12
import atexit
import logging

import sys

from corpus import Corpus
from crawler import Crawler
from frontier import Frontier

if __name__ == "__main__":
    # Configures basic logging
    logging.basicConfig(
        format='%(asctime)s (%(name)s) %(levelname)s %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p',
        level=logging.INFO)

    # Instantiates frontier and loads the last state if exists
    frontier = Frontier()
    frontier.load_frontier()

    # Instantiates corpus object with the given cmd arg
    corpus = Corpus(sys.argv[1])

    # Registers a shutdown hook to save frontier state upon unexpected shutdown
    atexit.register(frontier.save_frontier)

    # Instantiates a crawler object and starts crawling
    crawler = Crawler(frontier, corpus)
    crawler.start_crawling()
    crawler.analytics()
Example #13
    def __init__(self, space):
        self.__space = space
        self.__frontier = Frontier()
        self.__goal = None
        self.__origin = None
Example #14
def a_star_multidots(edges,
                     start: tuple,
                     goals: tuple,
                     estimate=mst_estimator):
    """
    Find the path from start to the goal using Greedy Best-first Search Algorithm
    The algorithm is implemented based on the description on Wikipedia:
    https://en.wikipedia.org/wiki/Best-first_search#Greedy_BFS
    Notice: GBFS is suboptimal algorithm, so the solution MAY NOT BE OPTIMAL!
    :param estimate: Heuristics used in a_star search
    :param edges: Search space, as a 2D list
    :param start: Start point, as a tuple
    :param goals: Goal points, as a set of all dots
    :return: The path (if found) from start to goal, or None
    """
    print('Analytics: start node ' + str(start) + ', dots node ' + str(goals))

    goals_to_indices = {g: i for i, g in enumerate(goals, 2)}

    start = init_state(start, goals)
    if start[0:2] in goals:
        start = mark_visited(start[0:2], goals_to_indices, start)

    # The set of nodes already evaluated
    visited = set()

    # For each node, the cost of getting from the start node to that node.
    # The cost of going from start to start is zero.
    g_score = {start: 0}

    # For each node, the total cost of getting from the start node to the dots
    # by passing by that node. That value is partly known, partly heuristic.
    # For the first node, that value is completely heuristic.
    # f_score = {start: naive_estimator(start, dots_visited[start], goals)}
    f_score = {start: estimate(start, goals, edges)}

    # The set of currently discovered nodes that are not evaluated yet.
    # Initially, only the start node is known.
    # frontier is implemented as a priority queue
    frontier = Frontier()
    frontier.add(start, f_score[start])

    # For each node, which node it can most efficiently be reached from.
    # If a node can be reached from many nodes, came_from will eventually contain the
    # most efficient previous step.
    came_from = {}

    while frontier:
        current, current_f_score = frontier.pop_nearest()
        if current[2:].count(1) == len(current) - 2:
            print('Analytics: ' + str(len(visited)) +
                  ' expanded nodes, out of ' +
                  str(len(edges) * (2**(len(current) - 2))) + ' nodes')
            return reconstruct_path(came_from, current)

        visited.add(current)
        for neighbor in expand_multidots(current, edges):
            if neighbor[0:2] in goals:
                neighbor = mark_visited(neighbor[0:2], goals_to_indices,
                                        neighbor)

            if neighbor not in visited:
                # Subtract 1 here because the edge_maps contains both start and end for
                # the shortest path between dots
                g_through_current = g_score[current] + len(
                    edges[current[0:2]][neighbor[0:2]]) - 1

                if (neighbor not in frontier
                        or g_through_current < g_score[neighbor]):
                    # Discover a new node or a better path

                    came_from[neighbor] = current
                    g_score[neighbor] = g_through_current

                    f_score[neighbor] = (g_score[neighbor] +
                                         estimate(neighbor, goals, edges))
                    frontier.add(neighbor, f_score[neighbor])

    return None
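
init_state and mark_visited are not shown. From the usage above, a state appears to be a tuple (row, col, flag_2, flag_3, ...) with one visited flag per dot, where flag indices come from goals_to_indices (which starts counting at 2). A minimal sketch under that assumption:

# Sketch of the assumed state helpers; the encoding is inferred, not authoritative.
def init_state(start, goals):
    # (row, col) followed by one 0 flag per dot.
    return tuple(start) + tuple(0 for _ in goals)

def mark_visited(pos, goals_to_indices, state):
    # Set the visited flag for the dot at pos.
    i = goals_to_indices[pos]
    return state[:i] + (1,) + state[i + 1:]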
Example #15
            if rp.can_fetch("*", url):
                get_urls(driver, frontier, page_id)
        elif is_html:
            # no robots.txt => parse everything :)
            # Write site to database without
            get_urls(driver, frontier, page_id)

        if not frontier.has_urls():
            print(th_num + " sleep")
            time.sleep(10)

    driver.close()


if __name__ == "__main__":
    frontier = Frontier(seed)
    robots = []
    rp = RobotFileParser()
    sp = SitemapParser()
    db = Database(use_database)

    init_sites()
    print(robots)
    start = time.time()

    # Read thread num argument
    thread_num = 1
    print(sys.argv)
    if len(sys.argv) > 1:
        thread_num = int(sys.argv[1])
Example #16
    def __init__(self, num_of_workers=1, seed=False, seed_path=None):
        self.n_workers = num_of_workers
        self.frontier = Frontier(seed=seed, seed_path=seed_path)
Example #17
    "http://www.dce.harvard.edu",
    "http://hsdm.harvard.edu",
    "http://www.fas.harvard.edu",
    "http://hds.harvard.edu",
    "http://www.gsd.harvard.edu",
    "http://www.gse.harvard.edu",
    "http://www.gsas.harvard.edu",
    "http://www.seas.harvard.edu",
    "https://www.hks.harvard.edu",
    "http://hls.harvard.edu",
    "http://www.radcliffe.harvard.edu",
    "http://hms.harvard.edu",
    "https://www.hsph.harvard.edu"
]

frontier = Frontier(SEEDS)
crawler = Crawler()

FILE = "/Users/Sun/Documents/IR/Data/HW3/pages/page"
FRONTIER_BACKUP = "/Users/Sun/Documents/IR/Data/HW3/pages/frontier"
# frontier.restore(open(FRONTIER_BACKUP))

crawled = 0
MIN_CRAWL = 35000

purl = None
DOMAIN_TIMESTAMP = {}

while not frontier.empty() and crawled < MIN_CRAWL:
    if crawled % 100 == 0:
        print str(crawled) + " pages crawled   " + str(
Example #18
    def __init__(self):
        self.count = 0
        self.last_domain = ''
        self.frontier = Frontier()
        self.store = Store()
Example #19
from frontier import Frontier
from parser import Parser
from graph import Graph
from pagerank import Ranker
from indexer import Indexer
from scorer import Scorer

frontier = Frontier([
    'http://mysql12.f4.htw-berlin.de/crawl/d01.html',
    'http://mysql12.f4.htw-berlin.de/crawl/d06.html',
    'http://mysql12.f4.htw-berlin.de/crawl/d08.html'
])
parser = Parser()
indexer = Indexer()
web_graph = Graph()

for url in frontier:
    # get outgoing links for the graph and content for tokenization
    body, links_on_page = parser.parse(url)

    # add document to indexer
    indexer.add_document(url, body)

    # build our webgraph
    node = web_graph.get_node(url)
    if node is None:
        node = web_graph.add_node(url)

    for out_link in links_on_page:
        web_graph.add_edge(url, out_link)
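
Ranker and Scorer are imported but never used in this excerpt; a hypothetical continuation might feed the finished graph to the ranker (Ranker's constructor and rank method are assumptions, not the module's documented API):

# Hypothetical continuation; Ranker's API is an assumption.
ranker = Ranker(web_graph)
ranks = ranker.rank()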