from pqdict import PQDict  # min-priority queue with dict-style key updates


def greedy_approx(G):
    """Build an approximate TSP tour of the undirected graph G by
    farthest-point selection with cheapest insertion; returns the tour
    as a node list together with its total weight."""
    vis = set()
    tot_weight = 0
    pq = PQDict()
    path = []
    # Initialize the priority queue that will surface the farthest node
    # once distances are calculated from the visited nodes.
    for node in G.nodes():
        pq.additem(node, float("-inf"))
    curr = pq.pop()
    vis.add(curr)
    path.append(curr)
    while len(pq) > 0:
        # Distance calculation: weights are negated so the min-oriented
        # queue surfaces the farthest node first.
        for s, nod, wt in G.edges(curr, data=True):
            if nod not in vis and -wt['weight'] > pq[nod]:
                pq.updateitem(nod, -wt['weight'])
        if len(pq) > 0:
            # Selection step
            top = pq.top()
            vis.add(top)
            curr = pq.pop()
            # Insertion step: insert at the location found by minCost()
            loc, cost = minCost(G, path, top)
            path.insert(loc, top)
            tot_weight += cost
    return path, tot_weight
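# greedy_approx() calls a minCost() helper that is not defined in this
# snippet. Below is a minimal sketch of what it presumably computes, under
# the assumptions that `path` is an open tour, G is complete over the path's
# nodes, and weights live under the 'weight' edge attribute. This is a
# hypothetical reconstruction, not the original helper:
def minCost(G, path, node):
    """Return (index, cost) of the cheapest place to insert `node` into path."""
    w = lambda u, v: G[u][v]['weight']
    # Appending at either end adds a single edge.
    best_loc, best_cost = len(path), w(path[-1], node)
    if w(path[0], node) < best_cost:
        best_loc, best_cost = 0, w(path[0], node)
    # Splicing between u and v replaces edge (u, v) with (u, node) + (node, v).
    for i in range(1, len(path)):
        u, v = path[i - 1], path[i]
        cost = w(u, node) + w(node, v) - w(u, v)
        if cost < best_cost:
            best_loc, best_cost = i, cost
    return best_loc, best_cost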
import networkx as nx
from pqdict import PQDict


def primMST(G):
    """Return an MST of the given undirected graph as a new graph,
    together with its total weight."""
    vis = set()
    tot_weight = 0
    pq = PQDict()
    Gprime = nx.Graph()
    # Add all nodes to the PQDict with infinite distance.
    for node in G.nodes():
        pq.additem(node, float("inf"))
    curr = pq.pop()  # select the initial node
    vis.add(curr)
    while len(pq) > 0:
        for s, nod, wt in G.edges(curr, data=True):
            if nod not in vis and wt['weight'] < pq[nod]:
                pq.updateitem(nod, wt['weight'])
        if len(pq) > 0:
            top = pq.top()
            # Attach `top` to the tree via its lightest edge into the visited
            # set. Tuple-parameter lambdas are Python 2 only, so the key
            # function indexes the edge tuple instead of unpacking it.
            source, destination, dist = [
                data for data in sorted(G.edges(top, data=True),
                                        key=lambda edge: edge[2]['weight'])
                if data[1] in vis
            ][0]
            Gprime.add_edge(source, destination, weight=dist['weight'])
            vis.add(top)
            tot_weight += pq[top]
            curr = pq.pop()
    return Gprime, tot_weight
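# Usage sketch for primMST() on a small weighted graph; the expected MST
# edges are a-b (1), b-c (2), c-d (3), for a total weight of 6:
H = nx.Graph()
H.add_weighted_edges_from([
    ('a', 'b', 1), ('b', 'c', 2), ('a', 'c', 4), ('c', 'd', 3),
])
mst, weight = primMST(H)
print(sorted(mst.edges()))  # the three MST edges
print(weight)               # 6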
def a_star(self, heuristic):
    node = self.tree.create_node(state=State(self.wrigglers), pathCost=0)
    node.heuristic = heuristic(node)
    frontier = PQDict()
    stateFrontier = {}
    explored = {}
    # Sacrifice memory for a huge speed-up: mirroring the frontier's f-values
    # in a dict keyed by state string makes membership checks instant.
    stateFrontier[str(node.state)] = node.heuristic  # pathCost is 0, so f = h
    frontier.additem(node._identifier, node.heuristic)
    while True:
        if len(frontier) == 0:
            return None
        nodeID = frontier.popitem()[0]
        node = self.tree.get_node(nodeID)
        nodeStateStr = str(node.state)
        del stateFrontier[nodeStateStr]
        if self.testGoal(node.state):
            return node
        explored[nodeStateStr] = -1  # we don't care what the hash maps to
        actions = self.getActions(node.state)
        for action in actions:
            child = self.childNode(node, action)
            child.heuristic = heuristic(child)
            childStr = str(child.state)
            inExplored = childStr in explored
            inFrontier = False
            bGreater = False
            if childStr in stateFrontier:
                inFrontier = True
                # The stored f-value is greater, so the new path is cheaper.
                if stateFrontier[childStr] > child.heuristic + child.pathCost:
                    bGreater = True
            if not inExplored and not inFrontier:
                # Track f = g + h, matching the priority used in the queue.
                stateFrontier[childStr] = child.heuristic + child.pathCost
                frontier.additem(child._identifier,
                                 child.heuristic + child.pathCost)
            elif bGreater:
                # Replace the worse frontier entry for this state.
                bHappened = False
                for key in frontier:
                    if str(self.tree.get_node(key).state) == childStr:
                        bHappened = True
                        frontier.pop(key)
                        frontier.additem(child._identifier,
                                         child.heuristic + child.pathCost)
                        stateFrontier[childStr] = (child.heuristic
                                                   + child.pathCost)
                        break
                assert bHappened
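# The frontier bookkeeping above uses pqdict's dict-style decrease-key
# interface. A self-contained illustration of that same pattern (plain
# Dijkstra on an adjacency dict, not the wriggler-specific search):
from pqdict import PQDict


def dijkstra(adj, source):
    """adj: {node: {neighbor: edge_cost}}; returns {node: distance}."""
    dist = {}
    pq = PQDict({source: 0})
    while pq:
        node, d = pq.popitem()  # node with the smallest tentative distance
        dist[node] = d
        for nbr, w in adj.get(node, {}).items():
            if nbr not in dist and (nbr not in pq or d + w < pq[nbr]):
                pq[nbr] = d + w  # insert or decrease-key in one call
    return dist


print(dijkstra({'a': {'b': 2}, 'b': {'c': 1}, 'c': {}}, 'a'))
# -> {'a': 0, 'b': 2, 'c': 3}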
def test_pop(self):
    # pop selected item - returns its pkey (priority)
    pq = PQDict(A=5, B=8, C=1)
    pkey = pq.pop('B')
    self.assertEqual(pkey, 8)
    pq.pop('A')
    pq.pop('C')
    self.assertRaises(KeyError, pq.pop, 'A')
    self.assertRaises(KeyError, pq.pop, 'does_not_exist')
    # no args and empty - raises
    self.assertRaises(KeyError, pq.pop)  # pq is now empty
    # no args - returns the top dkey
    pq = PQDict(A=5, B=8, C=1)
    self.assertEqual(pq.pop(), 'C')
def primWeight(G):
    """Return the total weight of an MST of the given undirected graph."""
    vis = set()
    tot_weight = 0
    pq = PQDict()
    for node in G.nodes():
        pq.additem(node, float("inf"))
    curr = pq.pop()
    vis.add(curr)
    while len(pq) > 0:
        for s, nod, wt in G.edges(curr, data=True):
            if nod not in vis and wt['weight'] < pq[nod]:
                pq.updateitem(nod, wt['weight'])
        if len(pq) > 0:
            top = pq.top()
            vis.add(top)
            tot_weight += pq[top]
            curr = pq.pop()
    return tot_weight
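# Usage sketch for primWeight(), with networkx and pqdict imported as above:
K = nx.Graph()
K.add_weighted_edges_from([(0, 1, 2), (1, 2, 1), (0, 2, 5)])
print(primWeight(K))  # expected MST weight: 1 + 2 = 3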
import binascii
import operator
import time
from collections import OrderedDict
from itertools import imap, izip, takewhile

from zope.interface import implements
from pqdict import PQDict
import kademlia.storage
# `validate` and `log_info` are project-local helpers assumed in scope.


class Storage:
    """
    Kademlia storage implementation. Three responsibilities:
    - Storing data
    - Listing old keys to refresh
    - Keeping a record of data popularity and evicting unpopular data
      when a storage limit is reached.
    """
    implements(kademlia.storage.IStorage)

    max_len = 5000

    def __init__(self, args, ttl=604800, time=time):
        self.args = args
        self.time = time
        # linked
        self.popularity_queue = PQDict()
        self.age_dict = OrderedDict()
        # separate
        self.future_popularity_queue = PQDict()
        self.step = ttl

    def cull(self):
        if len(self.popularity_queue) > self.max_len:
            key = self.popularity_queue.pop()
            if self.args.verbose:
                log_info('Dropping key {} (over count {})'.format(
                    binascii.hexlify(key), self.max_len))
            del self.age_dict[key]
        if len(self.future_popularity_queue) > self.max_len:
            key = self.future_popularity_queue.pop()
            if self.args.verbose:
                log_info('Dropping future key {} (over count {})'.format(
                    binascii.hexlify(key), self.max_len))

    def inc_popularity(self, key):
        current = self.popularity_queue.get(key)
        if current is not None:
            self.popularity_queue[key] = current + self.step
        else:
            current = self.future_popularity_queue.get(key, self.time.time())
            self.future_popularity_queue[key] = current + self.step

    def _tripleIterable(self):
        ikeys = self.age_dict.iterkeys()
        ibirthday = imap(operator.itemgetter(0), self.age_dict.itervalues())
        ivalues = imap(operator.itemgetter(1), self.age_dict.itervalues())
        return izip(ikeys, ibirthday, ivalues)

    # interface methods below

    def __setitem__(self, key, value):
        # Unpack the stored (age, value) entry, defaulting to (now, None).
        # The default tuple must be parenthesized; without the parentheses
        # `oldvalue` is always None and existing entries are never updated.
        age, oldvalue = self.age_dict.get(key) or (self.time.time(), None)
        if not validate(self.args, key, value, oldvalue)[0]:
            return
        if oldvalue is not None:
            self.age_dict[key] = (age, value)
        else:
            age = self.future_popularity_queue.pop(key, self.time.time())
            self.age_dict[key] = (self.time.time(), value)
            self.popularity_queue[key] = age
        self.cull()

    def __getitem__(self, key):
        self.inc_popularity(key)
        self.cull()
        return self.age_dict[key][1]

    def get(self, key, default=None):
        self.inc_popularity(key)
        self.cull()
        if key in self.age_dict:
            return self.age_dict[key][1]
        return default

    def iteritemsOlderThan(self, secondsOld):
        minBirthday = self.time.time() - secondsOld
        zipped = self._tripleIterable()
        matches = takewhile(lambda r: minBirthday >= r[1], zipped)
        return imap(operator.itemgetter(0, 2), matches)

    def iteritems(self):
        self.cull()
        return self.age_dict.iteritems()
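# The eviction in cull() relies on PQDict.pop() with no arguments returning
# the dkey with the smallest priority, i.e. the least-popular entry. A tiny
# standalone illustration (made-up keys, independent of kademlia):
pop_q = PQDict()
pop_q['key_a'] = 100.0  # popularity scores; smaller means less popular
pop_q['key_b'] = 50.0
pop_q['key_c'] = 75.0
print(pop_q.pop())  # -> 'key_b': the least-popular key is evicted first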
import logging
import queue
import urllib.robotparser
from urllib.parse import urlparse

import urltools
from googleapiclient.discovery import build
from pqdict import PQDict
# Project-local helpers assumed importable:
# Downloader, Parser, Calculator, Relevance, API_KEY, SEARCH_ENGINE_ID


class Crawler():

    def __init__(self):
        self.query = input("Enter search query: ")
        self.webpages_limit = input(
            "Set total number of webpages to be crawled: ")
        self.limit = input(
            "Set limits on how many webpages be crawled from single site: ")
        self.priority_queue = PQDict.maxpq()
        self.queue = queue.Queue()
        self.downloader = Downloader()
        self.parser = Parser(self.query)
        self.calculator = Calculator(self.query)
        self.relevance = Relevance()
        self.webpages_crawled = 0
        self.logger = logging.getLogger(__name__)
        self.visited_urls = set()
        self.sites_times = {}

    # fetch top 10 results from google search
    def __fetch_google_results(self):
        service = build("customsearch", "v1", developerKey=API_KEY)
        res = service.cse().list(q=self.query, cx=SEARCH_ENGINE_ID).execute()
        return res

    # enqueue the 10 google search results
    def enqueue_seeds(self):
        res = self.__fetch_google_results()
        for item in res['items']:
            self.priority_queue.additem(item['link'], 10)
            self.queue.put(item['link'])
            self.logger.debug("Enqueued: " + item['link'])

    # Check whether this URL has been visited before, whether its site has
    # reached the per-site limit, and whether the Robot Exclusion Protocol
    # allows fetching it.
    def urlchecker(self, url):
        if url is None:
            return False
        normalized_url = urltools.normalize(url)
        robotparser = urllib.robotparser.RobotFileParser()
        try:
            url_comp = urlparse(normalized_url)
            base_url = url_comp.scheme + "://" + url_comp.netloc + "/"
        except:
            self.logger.error("Cannot parse: " + url)
            return False  # base_url is unusable past this point
        try:
            robotparser.set_url(base_url + "robots.txt")
            robotparser.read()
            if not robotparser.can_fetch("*", normalized_url):
                self.logger.error(url + " is excluded due to protocol")
                return False
        except:
            self.logger.error("Cannot determine robots exclusion protocol: " + url)
        if normalized_url in self.visited_urls:
            self.logger.debug(url + " has been visited before!")
            return False
        elif (base_url in self.sites_times
              and self.sites_times[base_url] > int(self.limit)):
            # self.logger.debug(url + " times visiting this site have reached the limit")
            return False
        elif 'cgi' in normalized_url:
            return False
        else:
            return True

    # the crawling process
    def crawl(self):
        try:
            harvest_rate_accum = 0
            while self.webpages_crawled < int(self.webpages_limit):
                print(self.webpages_crawled)
                try:
                    url = self.priority_queue.pop()
                except KeyError:
                    print("cannot pop")
                    break
                if self.urlchecker(url):
                    # Defaults so a failed download cannot leave these unbound.
                    content = None
                    rel = 0
                    try:
                        content = self.downloader.download(url).decode('utf-8')
                        if content is not None:
                            self.webpages_crawled += 1
                            rel = self.relevance.relevance(content, self.query)
                            harvest_rate_accum += rel
                            self.crawled_log(" Harvest rate: " + str(
                                harvest_rate_accum / self.webpages_crawled))
                    except:
                        print("Failed in downloading")
                    normalized_url = urltools.normalize(url)
                    try:
                        url_comp = urlparse(normalized_url)
                        base_url = url_comp.scheme + "://" + url_comp.netloc + "/"
                    except:
                        self.logger.error("Cannot parse: " + url)
                        continue
                    if base_url in self.sites_times:
                        self.sites_times[base_url] += 1
                    else:
                        self.sites_times[base_url] = 1
                    self.visited_urls.add(normalized_url)
                    # Skip link extraction when the download failed or the
                    # page is not relevant enough.
                    if content is None or rel < 0.2:
                        continue
                    for link in self.parser.extract_all_links(content):
                        full_link = self.parser.parse_links(url, link)
                        if full_link is not None:
                            link_promise = self.calculator.link_promise(
                                full_link) + rel
                            try:
                                self.priority_queue.additem(full_link,
                                                            link_promise)
                            except:
                                pass
        except KeyError:
            print("Queue is empty now")

    def bfs_crawl(self):
        try:
            harvest_rate_accum = 0
            while self.webpages_crawled < int(self.webpages_limit):
                print(self.webpages_crawled)
                try:
                    url = self.queue.get()
                except Exception:
                    print("cannot pop")
                    break
                if self.urlchecker(url):
                    content = None
                    try:
                        content = self.downloader.download(url).decode('utf-8')
                        if content is not None:
                            self.webpages_crawled += 1
                            rel = self.relevance.relevance(content, self.query)
                            harvest_rate_accum += rel
                            self.crawled_log(" Harvest rate: " + str(
                                harvest_rate_accum / self.webpages_crawled))
                    except:
                        print("Failed in downloading")
                    normalized_url = urltools.normalize(url)
                    try:
                        url_comp = urlparse(normalized_url)
                        base_url = url_comp.scheme + "://" + url_comp.netloc + "/"
                    except:
                        self.logger.error("Cannot parse: " + url)
                        continue
                    self.visited_urls.add(normalized_url)
                    if content is None:
                        continue
                    for link in self.parser.extract_all_links(content):
                        full_link = self.parser.parse_links(url, link)
                        if full_link is not None:
                            try:
                                if base_url not in self.sites_times:
                                    self.sites_times[base_url] = 1
                                elif self.sites_times[base_url] < int(self.limit):
                                    self.sites_times[base_url] += 1
                                else:
                                    continue
                                self.queue.put(full_link)
                            except:
                                pass
        except KeyError:
            print("Queue is empty now")

    def crawled_log(self, log):
        with open('demo.log', 'a') as file:
            file.write(log + '\n\n')