class Page():
    def __init__(self, link, response):
        self.link = link
        self.frontier = Frontier(pop='random')
        self.response = response
        self.read_resp = self.response.read()
        self.soup = BeautifulSoup(self.read_resp)
        self.body = self.soup.body
        self.get_links()

    def __str__(self):
        return self.link

    def get_links(self):
        try:
            article = self.body.find("div", {"id": "mw-content-text"})
            for link in article.findAll('a'):
                try:
                    if self.is_valid_link(link['href']):
                        self.frontier.append(link['href'])
                        print link['href']
                except:
                    link
        except:
            "No body"

    def is_valid_link(self, link):
        if re.match('/wiki/.*', link) and not ":" in link:
            return link
class Crawler:
    '''
    This Crawler class will implement crawling the website, getting the text
    and the links in each page
    '''
    def __init__(self):
        self.frontier = Frontier()
        self.count = 0
        self.last_domain = ''
        self.store = Store()

    def crawl(self):
        '''
        pop a url from frontier and get the header, html, text and out links,
        push the out links into frontier and insert them into elasticsearch
        :return: None
        '''
        while True and self.count < MAX_COUNT:
            url = self.frontier.pop_url()
            try:
                current_domain = urlparse(url).netloc
                if current_domain not in self.frontier.robot_dict and self.frontier.no_robot:
                    self.frontier.add_robot_dict(url)
                if current_domain in self.frontier.robot_dict and not (
                        self.frontier.robot_dict[current_domain].can_fetch('*', url)):
                    continue
            except Exception, e:
                print 'current_domain_exception: {}'.format(e)
                continue

            print 'current url {}'.format(url)
            if current_domain == self.last_domain:
                time.sleep(1)
            else:
                self.last_domain = current_domain

            try:
                header, raw_html = self.downloader(url)
            except Exception, e:
                print 'downloader exception: {}'.format(e)
                continue

            try:
                text, title, links = self.parse_url(url, raw_html)
            except Exception, e:
                print e
                continue
def __init__(self):
    self.seed_urls = None
    self.frontier = Frontier()
    self.canonicalizer = Canonicalizer()
    self.all_links = None
    self.crawled_links = set()
    self.count = 0
    self.all_out_links = {}
    self.redirected_map = {}
    self.robots = {}
    self.robots_delay = {}
    self.robots_timer = {}
    self.time_out = 3
    self.total_count = 40000
def pick_clusters(self, clusters, nonleaves):
    """
    """
    _logger.debug("compute initial cluster errors. %d clusters", len(clusters))
    start = time.time()
    for c in clusters:
        c.error = self.influence_cluster(c)
        c.c_range = list(self.c_range)
        c.inf_func = self.create_inf_func(c)
    self.stats['init_cluster_errors'] = [time.time() - start, 1]

    self.update_status("computing frontier")
    _logger.debug("compute initial frontier")
    frontier, _ = Frontier(self.c_range, 0.001)(clusters)

    ret = list(frontier)

    _logger.debug("get nonleaves containing frontier")
    for nonleaf in nonleaves:
        for c in frontier:
            if nonleaf.contains(c):
                nonleaf.error = self.influence_cluster(nonleaf)
                ret.append(nonleaf)
                break

    self.update_status("expanding frontier (%d rules)" % len(ret))
    _logger.debug("second merger pass")
    return ret
def __init__(self, link, response):
    self.link = link
    self.frontier = Frontier(pop='random')
    self.response = response
    self.read_resp = self.response.read()
    self.soup = BeautifulSoup(self.read_resp)
    self.body = self.soup.body
    self.get_links()
def greedy(matrix, start, goal):
    """
    Find the path from start to the goal using the Greedy Best-first Search Algorithm
    The algorithm is implemented based on the description on Wikipedia:
    https://en.wikipedia.org/wiki/Best-first_search#Greedy_BFS
    Notice: GBFS is a suboptimal algorithm, so the solution MAY NOT BE OPTIMAL!
    :param matrix: Search space, as a 2D list
    :param start: Start point, as a tuple
    :param goal: Goal point, as a tuple
    :return: The path (if found) from start to goal, or None
    """
    print('Analytics: start node ' + str(start) + ', goal node ' + str(goal))

    # The set of nodes already evaluated
    visited = set()
    partially_expanded = set()

    # The set of currently discovered nodes that are not evaluated yet.
    # Initially, only the start node is known.
    # frontier is implemented as a priority queue
    frontier = Frontier()
    frontier.add(start, manhattan_dist(start, goal))

    # For each node, which node it can most efficiently be reached from.
    # If a node can be reached from many nodes, came_from will eventually contain the
    # most efficient previous step.
    came_from = {}

    while frontier:
        current, current_distance = frontier.nearest
        if current == goal:
            print('Analytics: ' + str(len(partially_expanded)) + ' expanded nodes out of '
                  + str(count_nodes(matrix)) + ' nodes, among which ' + str(len(visited))
                  + ' are fully expanded (all successors evaluated)')
            return reconstruct_path(came_from, current)

        partially_expanded.add(current)
        is_interrupted = False
        for neighbor in expand(current, matrix):
            if neighbor not in visited:
                neighbor_distance = manhattan_dist(neighbor, goal)
                if neighbor not in frontier:
                    # Discover a new node
                    came_from[neighbor] = current
                    frontier.add(neighbor, neighbor_distance)
                if current_distance > neighbor_distance:
                    is_interrupted = True
                    break
        if not is_interrupted:
            frontier.pop_nearest()
            visited.add(current)
    return None
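# A minimal sketch (an assumption for illustration, not the project's actual
# frontier.py) of the priority-queue interface that greedy() above and the
# a_star() variants below rely on: add(item, priority), nearest, pop_nearest(),
# membership tests, and truthiness. Re-adding an item simply pushes a second
# entry with the new priority.
import heapq


class Frontier:
    def __init__(self):
        self._heap = []          # (priority, item) tuples kept heap-ordered
        self._members = set()    # for fast "in" checks

    def add(self, item, priority):
        heapq.heappush(self._heap, (priority, item))
        self._members.add(item)

    @property
    def nearest(self):
        # Peek at the lowest-priority entry without removing it.
        priority, item = self._heap[0]
        return item, priority

    def pop_nearest(self):
        # Remove and return the lowest-priority entry as (item, priority).
        priority, item = heapq.heappop(self._heap)
        self._members.discard(item)
        return item, priority

    def __contains__(self, item):
        return item in self._members

    def __bool__(self):
        return bool(self._heap)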
def __init__(self, num_threads, seeds, cont_to_crawl):
    self.num_workers = num_threads
    self.dash = Dashboard(num_threads)
    self.workers = []
    self.frontier = Frontier(num_threads, self.dash)
    self.db = Storage()
    # Create the workers
    for i in range(num_threads):
        self.workers.append(CrawlerThread(i, 'CrawlerThread' + str(i), self.frontier, self.dash))
    # print("Workers created")
    # insert seeds in to serve
    if not cont_to_crawl:
        self.frontier.push_to_serve(seeds, 0)
        # print("seeds pushed")
        self.frontier.distribute()
        # print("seeds distributed")
    else:
        self.frontier.load_to_crawl()
def main():
    from frontier import Frontier
    from options import parse_command_line
    parse_command_line()
    ft = Frontier([
        ('http://localhost/', 1),
    ])
    Master(ft).start()
    IOLoop.instance().start()
def _a_star_search(self, heuristic_func):
    if self.state == Puzzle.goal_state:
        return [self.state], 0

    # heap of tuple(path_total_cost, path)
    # path_total_cost = path_cost + heuristic
    frontier = Frontier()
    frontier.push((heuristic_func(self.state), [self.state]))
    explored_list = []
    node_gen = 0

    while True:
        if frontier.empty():
            break

        # pop state with min cost from the frontier
        cur_path_total_cost, cur_path = frontier.pop()
        cur_state = cur_path[-1]

        # check lazy delete
        if cur_state in explored_list:
            continue

        # test goal condition
        if cur_state == Puzzle.goal_state:
            return cur_path, node_gen

        # add current state to explored list
        explored_list.append(cur_state)

        # get all neighbours of current state in asc order
        neighbours = Puzzle._get_neighbours(cur_state)
        for neighbour in neighbours:
            if neighbour not in explored_list:
                # new path to go to a neighbour
                path_to_neighbour = cur_path.copy()
                path_to_neighbour.append(neighbour)
                # calc path_total_cost (include heuristic)
                path_to_neighbour_total_cost = cur_path_total_cost - heuristic_func(cur_state) \
                    + 1 + heuristic_func(neighbour)
                node_gen += 1
                # if neighbour already in frontier or not
                # -> use lazy delete
                frontier.push((path_to_neighbour_total_cost, path_to_neighbour))

    return None, node_gen
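# A minimal sketch (assumed for illustration, not the puzzle project's actual
# Frontier) of the heap interface _a_star_search() above uses: push((cost, path)),
# pop() returning the smallest-cost entry, and empty(). It relies on heapq's tuple
# ordering, so tied costs fall back to comparing the paths themselves.
import heapq


class Frontier:
    def __init__(self):
        self._heap = []  # entries are (path_total_cost, path) tuples

    def push(self, entry):
        heapq.heappush(self._heap, entry)

    def pop(self):
        # Return the entry with the smallest path_total_cost.
        return heapq.heappop(self._heap)

    def empty(self):
        return len(self._heap) == 0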
def __init__(self):
    self.url_waiting_queue = mp.Queue()
    self.url_result_queue = mp.Queue()
    self.config = Config()
    self.frontier = Frontier(self.config)
    self.agents = []
    for id in range(self.config.agentCount):
        self.agents.append(Agent(self.config, id))
    # setting seed url on waiting queue
    self.url_waiting_queue.put(self.config.seedURL)
class Controller:
    """The Controller of the whole Crawling process"""

    def __init__(self, num_threads, seeds, cont_to_crawl):
        self.num_workers = num_threads
        self.dash = Dashboard(num_threads)
        self.workers = []
        self.frontier = Frontier(num_threads, self.dash)
        self.db = Storage()
        # Create the workers
        for i in range(num_threads):
            self.workers.append(CrawlerThread(i, 'CrawlerThread' + str(i), self.frontier, self.dash))
        # print("Workers created")
        # insert seeds in to serve
        if not cont_to_crawl:
            self.frontier.push_to_serve(seeds, 0)
            # print("seeds pushed")
            self.frontier.distribute()
            # print("seeds distributed")
        else:
            self.frontier.load_to_crawl()

    def run(self):
        """The main Program"""
        try:
            for i in range(self.num_workers):
                self.workers[i].start()
            # print("All Workers started")
            self.saver_to_crawl = PeriodicThread(self.frontier.save_to_crawl, 3600.0)
            self.saver_to_crawl.start()
            while True:
                self.frontier.distribute()
        except:
            self.dash.print_frontier_stat("saving before exit")
            self.frontier.save_to_crawl()
def wake_generative_with_pyccg(self, grammar, tasks,
                               maximumFrontier=None,
                               enumerationTimeout=None,
                               CPUs=None,
                               solver=None,
                               evaluationTimeout=None):
    """
    Dreamcoder wake_generative using PYCCG enumeration to guide exploration.
    Enumerates from PyCCG with a timeout and blindly from the EC grammar.
    Updates PyCCG using both sets of discovered meanings.
    Converts the meanings into EC-style frontiers to be handed off to EC.
    """
    # Enumerate PyCCG meanings and update the word learner.
    pyccg_meanings = {t: [] for t in tasks}
    if self.use_pyccg_enum:
        pyccg_meanings = self._update_pyccg_with_distant_batch(tasks, enumerationTimeout)

    # Enumerate the remaining tasks using EC-style blind enumeration.
    unsolved_tasks = [task for task in tasks if len(pyccg_meanings[task]) == 0]
    fallback_frontiers, fallback_times = [], None
    if self.use_blind_enum:
        fallback_frontiers, fallback_times = multicoreEnumeration(
            grammar, unsolved_tasks,
            maximumFrontier=maximumFrontier,
            enumerationTimeout=enumerationTimeout,
            CPUs=CPUs,
            solver=solver,
            evaluationTimeout=evaluationTimeout)

    # Log enumeration results.
    print("PyCCG model parsing results")
    self._describe_pyccg_results(pyccg_meanings)
    print("Non-language generative model enumeration results:")
    print(Frontier.describe(fallback_frontiers))

    # Update PyCCG model with fallback discovered frontiers.
    self._update_pyccg_with_supervised_batch(fallback_frontiers)  # TODO(catwong, jgauthier): does not yet update.

    # Convert and consolidate PyCCG meanings and fallback frontiers for handoff to EC.
    pyccg_frontiers = self._pyccg_meanings_to_ec_frontiers(pyccg_meanings)
    fallback_frontiers = {frontier.task: frontier for frontier in fallback_frontiers}
    all_frontiers = {t: pyccg_frontiers[t] if t in pyccg_frontiers else fallback_frontiers[t] for t in tasks}
    all_times = {t: enumerationTimeout if t in pyccg_frontiers else fallback_times[t] for t in tasks}

    return list(all_frontiers.values()), all_times
def main():
    corpus = Corpus(output="stack_without_dupes/result-{}.csv".format(CORPUS_SIZE))
    frontier = Frontier(corpus, 10, 8,
                        duplicate_identification=True,
                        verbose=VERBOSE, debug=DEBUG)
    crawler = Crawler(SEEDS, corpus, frontier,
                      corpuse_max_size=CORPUS_SIZE,
                      duplicate_identification=True,
                      verbose=VERBOSE, debug=DEBUG)
    print("Starting at {}".format(datetime.now()))
    crawler.start()
    print("Done at {}".format(datetime.now()))
def reweightbasegrammar(basegrammar, pseudoCounts, filter_depth=None, size=None):
    frontiers = []
    for datum in islice(
            batchloader('train', batchsize=1, compute_sketches=False, filter_depth=filter_depth),
            size):
        # TODO
        # class Task(object):
        #     def __init__(self, name, request, examples, features=None, cache=False):
        frontiers.append(
            Frontier([
                FrontierEntry(datum.p,
                              logPrior=basegrammar.logLikelihood(datum.tp, datum.p),
                              logLikelihood=0)
            ], Task('dummyName', datum.tp, [])))
    return basegrammar.insideOutside(frontiers, pseudoCounts, iterations=1)
def _pyccg_meanings_to_ec_frontiers(self, pyccg_meanings):
    """
    Ret:
        pyccg_frontiers: dict from task -> Dreamcoder frontiers.
    """
    pyccg_frontiers = {}
    for task in pyccg_meanings:
        if len(pyccg_meanings[task]) > 0:
            frontier_entries = []
            for (meaning, log_prob) in pyccg_meanings[task]:
                ec_sexpr = self.pyccg_learner.ontology.as_ec_sexpr(meaning)
                if self.ec_ontology_translation_fn:
                    ec_sexpr = self.ec_ontology_translation_fn(ec_sexpr, is_pyccg_to_ec=True)
                # Uses the p=1.0 likelihood for programs that solve the task.
                frontier_entry = FrontierEntry(
                    program=Program.parse(ec_sexpr),
                    logPrior=log_prob,
                    logLikelihood=0.0)
                frontier_entries.append(frontier_entry)
            pyccg_frontiers[task] = Frontier(frontier_entries, task)
    return pyccg_frontiers
import datetime
import hashlib
from colorama import Style
from colorama import Fore
from colorama import init
from tldextract import extract
import time
import traceback
import sys

init()

# Frontier object for frontier interaction
frontier = Frontier()

domains = ["gov.si", "evem.gov.si", "e-uprava.gov.si", "e-prostor.gov.si"]
urls = ["https://www.gov.si",
        "http://evem.gov.si/evem/drzavljani/zacetna.evem",
        "https://e-uprava.gov.si/",
        "https://www.e-prostor.gov.si/"]
allowed_domain = 'gov.si'

type_codes = {
    'application/msword': 'doc',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
    'application/pdf': 'pdf',
    'application/vnd.ms-powerpoint': 'ppt',
    'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
    'text/html': 'html'
}

request_rate_sec = 5
user_agent = "fri-ieps-nasagrupa"
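# A small hypothetical usage note (extension_for is not part of the original
# project): type_codes above maps a response Content-Type header to the file
# extension a crawler might use when storing the fetched document.
def extension_for(content_type):
    # Content-Type may carry parameters such as "; charset=utf-8"; keep the media type only.
    media_type = content_type.split(';')[0].strip().lower()
    return type_codes.get(media_type)

# e.g. extension_for('text/html; charset=UTF-8') -> 'html'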
"http://www.dce.harvard.edu", "http://hsdm.harvard.edu", "http://www.fas.harvard.edu", "http://hds.harvard.edu", "http://www.gsd.harvard.edu", "http://www.gse.harvard.edu", "http://www.gsas.harvard.edu", "http://www.seas.harvard.edu", "https://www.hks.harvard.edu", "http://hls.harvard.edu", "http://www.radcliffe.harvard.edu", "http://hms.harvard.edu", "https://www.hsph.harvard.edu" ] frontier = Frontier(SEEDS) crawler = Crawler() FILE = "/Users/Sun/Documents/IR/Data/HW3/pages/page" FRONTIER_BACKUP = "/Users/Sun/Documents/IR/Data/HW3/pages/frontier" # frontier.restore(open(FRONTIER_BACKUP)) crawled = 0 MIN_CRAWL = 35000 purl = None DOMAIN_TIMESTAMP = {} while not frontier.empty() and crawled < MIN_CRAWL: if crawled % 100 == 0: print str(crawled) + " pages crawled " + str(
class RouteFinder(object):
    def __init__(self, space):
        self.__space = space
        self.__frontier = Frontier()
        self.__goal = None
        self.__origin = None

    def __calculate_distance(self, from_xy, to_xy):
        """
        Calculate the distance between the given coordinates using MATH.
        :param from_xy: a tuple of the form (x, y)
        :param to_xy: a tuple of the form (x, y)
        :returns: float
        """
        diff_x = math.fabs(from_xy[0] - to_xy[0])
        diff_y = math.fabs(from_xy[1] - to_xy[1])
        return math.sqrt(math.pow(diff_x, 2) + math.pow(diff_y, 2))

    def __initialize_goal(self, goal_id):
        """
        Initialise the goal node, identified by the given ID.
        :param goal_id: the goal id
        """
        self.__goal = self.__space.nodes[goal_id]
        self.__goal.distance_to_goal = 0

    def __initialize_origin(self, origin_id):
        """
        Initialise the origin node, identified by the given ID.
        :param origin_id: the origin id
        """
        self.__origin = self.__space.nodes[origin_id]
        self.__origin.route_distance = 0

    def __add_to_frontier(self, from_node, to_node):
        """
        Attempt to add the "to" node to the frontier.
        :param from_node: the previous node
        :param to_node: the next node
        """
        if to_node.visited:
            return
        if from_node is None:
            distance_to_node = 0
        else:
            distance_to_node = self.__calculate_distance(
                (from_node.x, from_node.y), (to_node.x, to_node.y)) + from_node.distance_to_here
        # Have we found a shorter route to this node?
        if distance_to_node < to_node.distance_to_here:
            to_node.distance_to_here = distance_to_node
            to_node.via = from_node
        if math.isinf(to_node.distance_to_goal):
            # Always low-ball the estimated distance, otherwise A* won't work
            to_node.distance_to_goal = self.__calculate_distance(
                (to_node.x, to_node.y), (self.__goal.x, self.__goal.y)) * 0.9
        self.__frontier.add_or_update(to_node.id, to_node.total_distance_to_goal)

    def __get_route_ids(self):
        """
        Get the IDs of the best route from the origin to the goal.
        :returns: a list of node IDs
        """
        route = []
        node = self.__goal
        while node is not None:
            route.append(node.id)
            node = node.via
        return list(reversed(route))

    def __find_route(self):
        """
        Find the shortest route between the origin and the goal.
        :returns: a list of IDs representing the steps from the origin to the goal, or None
        """
        active_node = self.__space.nodes[self.__frontier.remove()]
        # Exit conditions
        if active_node is None:
            return None
        if active_node == self.__goal:
            return self.__get_route_ids()
        active_node.visited = True
        # Add the linked nodes to the frontier
        for linked_id in self.__space.find_linked_nodes(active_node.id):
            if linked_id not in self.__space.nodes:
                # Something is awry with the map data
                return None
            self.__add_to_frontier(active_node, self.__space.nodes[linked_id])
        return self.__find_route()

    def find_route(self, origin_id, goal_id):
        """
        Find the shortest route between the specified nodes.
        :param origin_id: the ID of the "origin" node
        :param goal_id: the ID of the "goal" node
        :returns: a list of IDs representing the steps from the origin to the goal, or None
        """
        if origin_id not in self.__space.nodes or goal_id not in self.__space.nodes:
            return None
        if origin_id == goal_id:
            return [origin_id]
        self.__space.reset()
        self.__frontier.reset()
        self.__initialize_origin(origin_id)
        self.__initialize_goal(goal_id)
        self.__add_to_frontier(None, self.__origin)
        return self.__find_route()
def __init__(self, space):
    self.__space = space
    self.__frontier = Frontier()
    self.__goal = None
    self.__origin = None
class Crawler:
    def __init__(self):
        self.seed_urls = None
        self.frontier = Frontier()
        self.canonicalizer = Canonicalizer()
        self.all_links = None
        self.crawled_links = set()
        self.count = 0
        self.all_out_links = {}
        self.redirected_map = {}
        self.robots = {}
        self.robots_delay = {}
        self.robots_timer = {}
        self.time_out = 3
        self.total_count = 40000

    def initialize(self, seed_urls):
        self.all_links = set(seed_urls)
        self.seed_urls = seed_urls
        self.frontier.initialize(seed_urls)

    def crawl_control(self):
        file_io.initialize_log()
        current_wave = 0
        while True:
            # if empty, move to next wave
            if self.frontier.is_empty():
                self.frontier.change_wave(current_wave + 1)
                # if still empty, finished
                if self.frontier.is_empty():
                    self.finish()
                    return "Finished"
            current_wave, score, url = self.frontier.frontier_pop()
            # get protocol, domain
            domain = self.canonicalizer.get_domain(url)
            # check robots.txt
            if domain not in self.robots:
                try:
                    robots = Robots("http://" + domain + "/robots.txt")
                    self.robots[domain] = robots
                    if robots.delay > self.time_out:
                        self.robots_delay[domain] = self.time_out
                    else:
                        self.robots_delay[domain] = robots.delay
                    self.robots_timer[domain] = datetime.now()
                except Exception as e:
                    error = "Read robots.txt error:\n{0}\nError: {1}\n\n".format(
                        "http://" + domain + "/robots.txt", e)
                    file_io.write_error_info(error)
                    continue
            delay = self.robots_delay[domain]
            # check if can fetch
            if not self.robots[domain].can_fetch(url):
                not_allowed = "Not Allowed: {}\n".format(url)
                print(not_allowed)
                file_io.write_not_allowed(not_allowed)
                continue
            else:
                # politeness
                since_last_crawl = datetime.now() - self.robots_timer[domain]
                if since_last_crawl.total_seconds() < delay:
                    time.sleep(delay - since_last_crawl.total_seconds())
            print("Current: " + url)
            file_io.write_current_link(url)
            # print time interval
            # print((datetime.now() - self.robots_timer[domain]).total_seconds())
            # get page header
            try:
                url_head = self.get_head(url)
                if url_head.status_code == 404:
                    error = "Status error:\n{0}\nError code: {1}\n\n".format(url, url_head.status_code)
                    file_io.write_error_info(error)
                    continue
            except Exception as e:
                error = "Read head error:\n{0}\nError: {1}\n\n".format(url, e)
                file_io.write_error_info(error)
                self.robots_timer[domain] = datetime.now()
                continue
            header = dict(url_head.headers)
            # get content type
            if "content-type" in url_head.headers:
                content_type = url_head.headers["content-type"]
            else:
                content_type = "text/html"
            # crawl html type
            if "text/html" not in content_type:
                continue
            else:
                # read page
                try:
                    soup, raw_html, base_url, lang = self.get_page(url)
                    self.robots_timer[domain] = datetime.now()
                    # whether we should crawl, language, black list
                    if not self.page_should_crawl(base_url, lang):
                        continue
                    # multiple redirected url
                    if base_url in self.crawled_links:
                        self.frontier.objects[base_url].in_links.update(
                            self.frontier.objects[url].in_links)
                        error = "Multiple redirected URL:\nURL: {0}\nRedirected URL: {1}\n\n".format(url, base_url)
                        file_io.write_error_info(error)
                        continue
                    else:
                        self.crawled_links.add(base_url)
                        frontier_item = FrontierItem(base_url)
                        frontier_item.in_links = self.frontier.objects[url].in_links
                        self.frontier.objects[base_url] = frontier_item
                        self.redirected_map[url] = base_url
                except Exception as e:
                    error = "Read page error:\n{0}\nError: {1}\n\n".format(url, e)
                    file_io.write_error_info(error)
                    self.robots_timer[domain] = datetime.now()
                    continue
                raw_out_links = self.get_out_links(soup)
                out_links = []
                # write as ap format
                text = self.extract_text(soup)
                if len(soup.select("title")) != 0:
                    title = soup.select("title")[0].get_text()
                else:
                    title = None
                file_io.write_ap(base_url, text, header, title)
                file_io.write_raw_html({base_url: raw_html})
                for link in raw_out_links:
                    processed_link = self.canonicalizer.canonicalize(base_url, domain, link)
                    file_io.write_canonicalization(link, processed_link)
                    # if link is not empty
                    if len(processed_link) != 0:
                        out_links.append(processed_link)
                        if processed_link not in self.all_links:
                            # new frontier item
                            frontier_item = FrontierItem(processed_link, link)
                            frontier_item.update_in_links(base_url)
                            self.frontier.frontier_put(frontier_item, current_wave + 1)
                            self.all_links.add(processed_link)
                        else:
                            # update in links
                            if processed_link in self.redirected_map:
                                redirected = self.redirected_map[processed_link]
                                self.frontier.frontier_update_inlinks(redirected, base_url)
                            else:
                                self.frontier.frontier_update_inlinks(processed_link, base_url)
                file_io.write_all_out_links({base_url: out_links})
                self.count += 1
                print(self.count, current_wave, url, score)
                file_io.write_log(self.count, current_wave, url, score)
                file_io.write_final_info(len(self.crawled_links), len(self.all_links))
                if self.count == self.total_count:
                    self.finish()
                    print("Finished")
                    return

    def finish(self):
        for url in self.crawled_links:
            file_io.write_crawled_links(url)
            file_io.write_all_in_links({url: list(self.frontier.objects[url].in_links)})
        file_io.write_all_links(self.all_links)

    def get_out_links(self, soup):
        a = soup.select('a')
        out_links = []
        for item in a:
            if item.get('href'):
                out_links.append(item['href'])
        return out_links

    def get_page(self, url: str):
        headers = {"Connection": "close"}
        res = requests.get(url=url, headers=headers, timeout=self.time_out)
        soup = BeautifulSoup(res.text, "lxml")
        try:
            if soup.select("html")[0].has_attr("lang"):
                lang = soup.select("html")[0]['lang']
            else:
                lang = "en"
        except Exception as e:
            error = "Read language error:\n{0}\nError: {1}\n\n".format(url, e)
            file_io.write_error_info(error)
            lang = "en"
        base_url = res.url
        return soup, res.text, base_url, lang

    def get_head(self, url: str):
        headers = {"Connection": "close"}
        head = requests.head(url=url, headers=headers, timeout=self.time_out, allow_redirects=True)
        return head

    def extract_text(self, soup: BeautifulSoup):
        output = ""
        text = soup.find_all("p")
        for t in text:
            new_t = t.get_text()
            new_t = re.sub("\n", "", new_t)
            new_t = re.sub(" +", " ", new_t)
            if len(new_t) == 0:
                continue
            output += "{} ".format(new_t)
        return output

    def page_should_crawl(self, base_url, lang):
        result = True
        # check language
        if "en" not in lang.lower():
            error = "Language error: {0}\nLanguage = {1}\n\n".format(base_url, lang)
            file_io.write_error_info(error)
            result = False
        # check black list
        black_list = [
            ".jpg", ".svg", ".png", ".pdf", ".gif", "youtube", "edit",
            "footer", "sidebar", "cite", "special", "mailto", "books.google",
            "tel:", "javascript", "www.vatican.va", ".ogv", "amazon", ".webm"
        ]
        block = 0
        key = ""
        for key in black_list:
            if key in base_url.lower():
                block = 1
                break
        if block == 1:
            error = "Page type error: {0}\nkeyword = {1}\n\n".format(base_url, key)
            file_io.write_error_info(error)
            result = False
        return result
if rp.can_fetch("*", url): get_urls(driver, frontier, page_id) elif is_html: # no robots.txt => parse everything :) # Write site to database without get_urls(driver, frontier, page_id) if not frontier.has_urls(): print(th_num + " sleep") time.sleep(10) driver.close() if __name__ == "__main__": frontier = Frontier(seed) robots = [] rp = RobotFileParser() sp = SitemapParser() db = Database(use_database) init_sites() print(robots) start = time.time() # Read thread num argument thread_num = 1 print(sys.argv) if len(sys.argv) > 1: thread_num = int(sys.argv[1])
import atexit
import logging

from crawler import Crawler
from frontier import Frontier

if __name__ == "__main__":
    # Configures basic logging
    logging.basicConfig(
        format='%(asctime)s (%(name)s) %(levelname)s %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p',
        level=logging.INFO)

    # Instantiates frontier and loads the last state if exists
    frontier = Frontier()
    frontier.load_frontier()

    # Registers a shutdown hook to save frontier state upon unexpected shutdown
    atexit.register(frontier.save_frontier)

    # Instantiates a crawler object and starts crawling
    crawler = Crawler(frontier)
    crawler.start_crawling()

    frontier.data_dump()
    crawler.data_dump2()
from frontier import Frontier
from parser import Parser
from graph import Graph
from pagerank import Ranker
from indexer import Indexer
from scorer import Scorer

frontier = Frontier([
    'http://mysql12.f4.htw-berlin.de/crawl/d01.html',
    'http://mysql12.f4.htw-berlin.de/crawl/d06.html',
    'http://mysql12.f4.htw-berlin.de/crawl/d08.html'
])
parser = Parser()
indexer = Indexer()
web_graph = Graph()

for url in frontier:
    # get outgoing links for the graph and content for tokenization
    body, links_on_page = parser.parse(url)
    # add document to indexer
    indexer.add_document(url, body)
    # build our webgraph
    node = web_graph.get_node(url)
    if node is None:
        node = web_graph.add_node(url)
    for out_link in links_on_page:
        web_graph.add_edge(url, out_link)
from frontier import Frontier
from node import Node

frontier = Frontier()
frontier.add_or_update('a', 40)
frontier.add_or_update('b', 20)
frontier.add_or_update('c', 30)

assert frontier.remove() == 'b'
assert frontier.remove() == 'c'

frontier.add_or_update('d', 10)
frontier.add_or_update('e', 50)

assert frontier.remove() == 'd'
assert frontier.remove() == 'a'
assert frontier.remove() == 'e'
assert frontier.remove() is None
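# A hedged sketch (an assumption, not the repository's actual frontier.py) of a
# Frontier that satisfies the assertions above and the RouteFinder usage earlier:
# add_or_update() inserts or re-prioritises a key, remove() pops the key with the
# lowest priority and returns None once empty, and reset() clears the queue.
# Stale heap entries left behind by a later add_or_update are skipped lazily.
import heapq


class Frontier:
    def __init__(self):
        self.reset()

    def reset(self):
        self._heap = []        # (priority, key) entries, possibly stale
        self._priorities = {}  # current priority per key

    def add_or_update(self, key, priority):
        self._priorities[key] = priority
        heapq.heappush(self._heap, (priority, key))

    def remove(self):
        while self._heap:
            priority, key = heapq.heappop(self._heap)
            # Skip entries that were superseded by a later add_or_update call.
            if self._priorities.get(key) == priority:
                del self._priorities[key]
                return key
        return None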
import atexit
import logging
import sys

from corpus import Corpus
from crawler import Crawler
from frontier import Frontier

if __name__ == "__main__":
    # Configures basic logging
    logging.basicConfig(
        format='%(asctime)s (%(name)s) %(levelname)s %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p',
        level=logging.INFO)

    # Instantiates frontier and loads the last state if exists
    frontier = Frontier()
    frontier.load_frontier()

    # Instantiates corpus object with the given cmd arg
    corpus = Corpus(sys.argv[1])

    # Registers a shutdown hook to save frontier state upon unexpected shutdown
    atexit.register(frontier.save_frontier)

    # Instantiates a crawler object and starts crawling
    crawler = Crawler(frontier, corpus)
    crawler.start_crawling()
    crawler.analytics()
def __init__(self, num_of_workers=1, seed=False, seed_path=None):
    self.n_workers = num_of_workers
    self.frontier = Frontier(seed=seed, seed_path=seed_path)
def __init__(self):
    self.frontier = Frontier()
    self.count = 0
    self.last_domain = ''
    self.store = Store()
class Crawler:
    '''
    crawling the website, get the text and the links in each page
    '''
    def __init__(self):
        self.count = 0
        self.last_domain = ''
        self.frontier = Frontier()
        self.store = Store()

    def initial_seeds(self):
        self.frontier.initial_queue()

    def parseRobot(self, domain):
        robot_url = 'http://' + domain + '/robots.txt'
        try:
            robot_file = urllib2.urlopen(robot_url).read()
            robot_content = ''
            for l in robot_file.split('\n'):
                if l.replace(' ', '') != '':
                    robot_content += l + '\n'
            robot_parser = robotexclusionrulesparser.RobotExclusionRulesParser()
            robot_parser.parse(robot_content)
            try:
                crawler_delay = robot_parser.get_crawl_delay('*')
            except Exception as e:
                ## print 'crawler_delay exception: {}'.format(e)
                crawler_delay = None
            return robot_parser, crawler_delay
        except Exception as e:
            ## print 'robot parse exception: {}'.format(e)
            return None, None

    def crawl(self):
        '''
        pop a url from frontier and get the header, html, text and out links.
        push the out links into frontier and insert them into elasticsearch
        '''
        while self.count < MAX_COUNT:
            level, url = self.frontier.pop_url()
            try:
                current_domain = urlparse(url).netloc
                ## if current_domain not in self.frontier.robot_dict and self.frontier.no_robot:
                ##     self.frontier.add_robot_dict(url)
                ##
                ## if current_domain in self.frontier.robot_dict and not (self.frontier.robot_dict[current_domain].can_fetch('*', url)):
                ##     continue
                robot_parser, crawler_delay = self.parseRobot(current_domain)
                if robot_parser is not None:
                    if not robot_parser.is_allowed('*', url):
                        print 'not allowed to crawl: {}'.format(url)
                        continue
                if crawler_delay is not None:
                    time.sleep(crawler_delay)
            except Exception as e:
                print 'current_domain_exception: {}'.format(e)
                print url
                continue

            if current_domain == self.last_domain:
                time.sleep(1)
            else:
                self.last_domain = current_domain

            try:
                header, raw_html = self.downloader(url)
            except Exception as e:
                print 'downloader exception: {}'.format(e)
                continue

            try:
                text, title, links = self.parse_url(url, raw_html)
            except Exception as e:
                print 'parse exception: {}'.format(e)
                continue

            if text or links:
                self.count += 1
                out_links = []
                for link in links:
                    try:
                        if len(self.frontier.pq) > MAX_COUNT:
                            break
                        if self.frontier.check_push_url(link, url):
                            out_links.append(link)
                    except Exception as e:
                        continue
                print 'FINISHED: {}'.format(self.count)
                self.store.insert(self.count, url, header, title, text, raw_html, [], out_links, level)
                self.write_to_file(self.count, url, header, title, text, raw_html, out_links, level)
            else:
                continue

        self.frontier.write_in_links()
        self.store.write_urls()
def __init__(self):
    self.count = 0
    self.last_domain = ''
    self.frontier = Frontier()
    self.store = Store()
def a_star(matrix, start, goal, estimate=manhattan_dist):
    """
    Find the path from start to the goal using the A* Search Algorithm
    The algorithm is implemented based on the description on Wikipedia:
    https://en.wikipedia.org/wiki/A*_search_algorithm
    :param estimate: Heuristic used in the A* search
    :param matrix: Search space, as a 2D list
    :param start: Start point, as a tuple
    :param goal: Goal point, as a tuple
    :return: The path (if found) from start to goal, or None
    """
    print('Analytics: start node ' + str(start) + ', goal node ' + str(goal))

    # The set of nodes already evaluated
    visited = set()

    # For each node, the cost of getting from the start node to that node.
    # The cost of going from start to start is zero.
    g_score = {start: 0}

    # For each node, the total cost of getting from the start node to the goal
    # by passing by that node. That value is partly known, partly heuristic.
    # For the first node, that value is completely heuristic.
    f_score = {start: estimate(start, goal)}

    # The set of currently discovered nodes that are not evaluated yet.
    # Initially, only the start node is known.
    # frontier is implemented as a priority queue
    frontier = Frontier()
    frontier.add(start, f_score[start])

    # For each node, which node it can most efficiently be reached from.
    # If a node can be reached from many nodes, came_from will eventually contain the
    # most efficient previous step.
    came_from = {}

    while frontier:
        current, current_f_score = frontier.pop_nearest()
        if current == goal:
            print('Analytics: ' + str(len(visited)) + ' expanded nodes, out of '
                  + str(count_nodes(matrix)) + ' nodes')
            # draw_expanded_nodes(matrix, visited)
            return reconstruct_path(came_from, current)

        visited.add(current)
        for neighbor in expand(current, matrix):
            if neighbor not in visited:
                g_through_current = g_score[current] + 1  # every neighbor has distance 1
                if (neighbor not in frontier
                        or g_through_current < g_score[neighbor]):
                    # Discover a new node or a better path
                    came_from[neighbor] = current
                    g_score[neighbor] = g_through_current
                    f_score[neighbor] = g_score[neighbor] + estimate(neighbor, goal)
                    frontier.add(neighbor, f_score[neighbor])
    return None
def a_star_multidots(edges, start: tuple, goals: tuple, estimate=mst_estimator):
    """
    Find a path from start through all of the goal dots using the A* Search Algorithm
    The algorithm is implemented based on the description on Wikipedia:
    https://en.wikipedia.org/wiki/A*_search_algorithm
    :param estimate: Heuristic used in the A* search
    :param edges: Search space, as a 2D list
    :param start: Start point, as a tuple
    :param goals: Goal points, as a set of all dots
    :return: The path (if found) from start to goal, or None
    """
    print('Analytics: start node ' + str(start) + ', dots node ' + str(goals))
    goals_to_indices = {g: i for i, g in enumerate(goals, 2)}
    start = init_state(start, goals)
    if start[0:2] in goals:
        start = mark_visited(start[0:2], goals_to_indices, start)

    # The set of nodes already evaluated
    visited = set()

    # For each node, the cost of getting from the start node to that node.
    # The cost of going from start to start is zero.
    g_score = {start: 0}

    # For each node, the total cost of getting from the start node to the dots
    # by passing by that node. That value is partly known, partly heuristic.
    # For the first node, that value is completely heuristic.
    # f_score = {start: naive_estimator(start, dots_visited[start], goals)}
    f_score = {start: estimate(start, goals, edges)}

    # The set of currently discovered nodes that are not evaluated yet.
    # Initially, only the start node is known.
    # frontier is implemented as a priority queue
    frontier = Frontier()
    frontier.add(start, f_score[start])

    # For each node, which node it can most efficiently be reached from.
    # If a node can be reached from many nodes, came_from will eventually contain the
    # most efficient previous step.
    came_from = {}

    while frontier:
        current, current_f_score = frontier.pop_nearest()
        if current[2:].count(1) == len(current) - 2:
            print('Analytics: ' + str(len(visited)) + ' expanded nodes, out of '
                  + str(len(edges) * (2 ** (len(current) - 2))) + ' nodes')
            return reconstruct_path(came_from, current)

        visited.add(current)
        for neighbor in expand_multidots(current, edges):
            if neighbor[0:2] in goals:
                neighbor = mark_visited(neighbor[0:2], goals_to_indices, neighbor)
            if neighbor not in visited:
                # Subtract 1 here because the edge_maps contains both start and end for
                # the shortest path between dots
                g_through_current = g_score[current] + len(edges[current[0:2]][neighbor[0:2]]) - 1
                if (neighbor not in frontier
                        or g_through_current < g_score[neighbor]):
                    # Discover a new node or a better path
                    came_from[neighbor] = current
                    g_score[neighbor] = g_through_current
                    f_score[neighbor] = g_score[neighbor] + estimate(neighbor, goals, edges)
                    frontier.add(neighbor, f_score[neighbor])
    return None