def mergeKLists(self, lists):
    """Merge k sorted linked lists into one new sorted linked list.

    :type lists: List[ListNode]
    :rtype: ListNode

    A min-heap of ``(value, list_index)`` pairs always exposes the smallest
    head among the lists.  The result is built from freshly created
    ``ListNode`` objects; neither the input nodes nor the ``lists`` sequence
    are modified.  Returns ``None`` when every input list is empty.
    """
    import heapq

    # Private cursors so the caller's list is left untouched.
    cursors = list(lists)
    heap = [(node.val, i) for i, node in enumerate(cursors) if node]
    heapq.heapify(heap)

    head = tail = None
    while heap:
        val, i = heapq.heappop(heap)
        fresh = ListNode(val)
        if head is None:
            head = fresh
        else:
            tail.next = fresh
        tail = fresh
        # Advance the list we just consumed from and re-offer its new head.
        cursors[i] = cursors[i].next
        if cursors[i]:
            heapq.heappush(heap, (cursors[i].val, i))
    return head
def queue_sort(queue, pathTuple, opened_nodes, method):
    """Score newly opened paths, enqueue them, and prune for local searches.

    Every path in ``opened_nodes`` is scored with ``calculateValue``, has the
    score stored on ``path.fnValue`` and is queued as ``(score, path)``.
    Nothing is queued when ``method`` is ``"depth_limited"`` and
    ``pathTuple[0]`` is 0.

    After printing the queue, ``"hill_climbing"`` keeps only the single best
    entry (when there is one) and ``"beam"`` keeps at most the two best
    entries, each in a brand-new ``PriorityQueue``.

    Returns the queue to continue the search with.
    """
    # NOTE(review): the source line lost its indentation; printLine is assumed
    # to run unconditionally, after the enqueue loop -- confirm.
    q = queue
    if method != "depth_limited" or pathTuple[0] != 0:
        for path in opened_nodes:
            value = calculateValue(path, method)
            path.fnValue = value
            q.put_nowait((value, path))
    printLine(q, method)

    if method == "hill_climbing":
        # An empty queue is returned as-is, not replaced.
        if not q.empty():
            q = _keep_best(q, 1)
    if method == "beam":
        q = _keep_best(q, 2)
    return q


def _keep_best(q, width):
    """Return a new PriorityQueue holding the ``width`` lowest entries of q."""
    # Collecting into a list avoids the old ``False`` sentinel, which would
    # silently drop any queued entry that compares equal to False.
    survivors = []
    while len(survivors) < width and not q.empty():
        survivors.append(q.get_nowait())
    pruned = PriorityQueue()
    for entry in survivors:
        pruned.put_nowait(entry)
    return pruned
def main():
    """Benchmark GrimHeaper's BinaryHeap against the stdlib PriorityQueue.

    The optional first command-line argument is the number of items to push
    and then pop (default 1,000,000).  Fill, drain and total times are
    printed in milliseconds for each container.
    """
    args = sys.argv[1:]
    passes = int(args[0]) if args else 1000000

    _time_fill_and_drain("GrimHeaper", "heap", BinaryHeap,
                         "put", "pop", passes)
    _time_fill_and_drain("Python's PriorityQueue", "queue", PriorityQueue,
                         "put", "get_nowait", passes)


def _time_fill_and_drain(title, noun, factory, put_name, get_name, passes):
    """Time ``passes`` insertions then ``passes`` removals on a new container."""
    print("")
    print("Using %s..." % title)
    print("")
    container = factory()
    print("")
    print("Creating %s with %d items..." % (noun, passes))
    print("")

    put = getattr(container, put_name)
    started = time.time()
    for i in range(passes):
        put(i)
    # time.time() deltas are seconds; the messages promise milliseconds.
    fill_ms = (time.time() - started) * 1000.0
    print("")
    print("%s filled with %d items after %.2fms"
          % (noun.capitalize(), passes, fill_ms))

    take = getattr(container, get_name)
    started = time.time()
    for _ in range(passes):
        take()
    empty_ms = (time.time() - started) * 1000.0
    print("%s emptied with %d items after %.2fms"
          % (noun.capitalize(), passes, empty_ms))
    print("")
    print("Total time: %.2fms" % (fill_ms + empty_ms))
class ThreadPool(object):
    """Pool of workers that all consume jobs from one shared priority queue.

    Jobs are queued as ``(priority, job)`` tuples.  Nothing is accepted
    before ``start()`` or after ``stop()``; such attempts are only logged.
    """

    def __init__(self, workersLimit, queueLimit=-1):
        self._workersLimit = workersLimit
        self._workers = []
        self._running = False
        self._jobs = PriorityQueue(queueLimit)

    def start(self):
        """Create and start up to ``workersLimit`` workers, then accept jobs."""
        for _ in range(self._workersLimit):
            worker = self._createNewWorker()
            try:
                worker.start()
            except Exception:
                # A worker that failed to start is logged and not tracked.
                _logger.error('Worker has not been started properly: %r', worker)
                continue
            self._workers.append(worker)
        self._running = True

    def stop(self):
        """Refuse new jobs, drop pending ones and queue a TerminateJob per worker."""
        self._running = False
        while True:
            try:
                self._jobs.get_nowait()
            except QueueEmptyError:
                break
        for _ in self._workers:
            self._jobs.put_nowait((_LOW_PRIORITY, TerminateJob()))
        self._workers = []

    def _createNewWorker(self):
        return Worker(self._jobs)

    def _enqueue(self, priority, job):
        """Queue ``job`` at ``priority``, or log an error if the pool is stopped."""
        if self._running:
            self._jobs.put_nowait((priority, job))
        else:
            _logger.error('Thread pool is not running. Trying to put new job: %r', job)

    def putLowPriorityJob(self, job):
        """Queue ``job`` with the low priority."""
        self._enqueue(_LOW_PRIORITY, job)

    def putJob(self, job):
        """Queue ``job`` with the default priority."""
        self._enqueue(_DEFAULT_PRIORITY, job)

    def __repr__(self):
        details = (self.__class__.__name__, len(self._workers), self._jobs.qsize())
        return '%s(workers = %d; jobs = %d)' % details
def getAlphaBeta(game, player, other, reward=None, depth=10, tab=0):
    '''
    Search ``depth`` plies ahead and return the best move for ``player``.

    alpha: minimum bound of the outcome -- currently, evaluate of best move possible by other
    beta: maximum bound of the outcome -- currently, evaluate of best move possible by player
    reward: AlphaBeta bounds to search within; a fresh ``AlphaBeta(-100, 100)``
        is created when omitted.  Its ``evaluation`` attribute is overwritten.
    returns: an AlphaBetaOfMove -- (alpha, beta, evaluation, move)

    Raises AssertionError if the game rejects a move or its undo.
    '''
    # A default object built in the signature would be shared by, and
    # mutated through, every call that omits ``reward``.
    if reward is None:
        reward = AlphaBeta(-100, 100)

    # evaluation is the current value of board, assuming no more moves in future
    # alpha == beta == finalvalue if we figure out the outcome.
    reward.evaluation = game.evaluate_cached(player)
    move = Move(-1, -1)

    # base case: the evaluation already reached the upper bound, outcome decided
    if reward.evaluation == reward.beta:
        return AlphaBetaOfMove(AlphaBeta(reward.beta, reward.beta, reward.beta), move)

    # base case: can't evaluate further
    if depth == 0:
        return AlphaBetaOfMove(reward, move)

    q = PriorityQueue()
    for m in game.next_moves():
        # Explicit checks instead of ``assert``: under ``python -O`` asserts are
        # stripped, which would skip the move and its undo entirely.
        if not game.move(player, m):
            raise AssertionError("game rejected move %r" % (m,))
        try:
            oponent_reward = AlphaBeta(-reward.beta, -reward.alpha, -reward.evaluation)
            oponent_reward = getAlphaBeta(game, other, player,
                                          reward=oponent_reward,
                                          depth=depth - 1,
                                          tab=tab + 1).alphabeta
            player_reward = AlphaBeta(-oponent_reward.beta,
                                      -oponent_reward.alpha,
                                      -oponent_reward.evaluation)
            q.put_nowait(AlphaBetaOfMove(player_reward, m))
        finally:
            if not game.unmove(player, m):
                raise AssertionError("game could not undo move %r" % (m,))

    if q.empty():
        return AlphaBetaOfMove(reward, Move(-1, -1))
    # The queue orders AlphaBetaOfMove objects; its smallest entry is the pick.
    return q.get_nowait()
def req_proxy(self, url):
    """Pick a proxy to use for ``url`` and mark it as taken.

    Proxies are pulled from the instance's queue at most
    ``proxy_in_queue_count()`` times.  A proxy used less than
    ``settings["interval_second"]`` seconds ago is skipped.  A proxy whose
    unavailable count for the URL's host has reached
    ``settings["max_unavailable_count"]`` is deferred nine times out of ten
    and only used if no better proxy turns up.

    The chosen proxy's meta gets ``last_used_time`` and ``master`` updated.
    Returns the ``proxy`` attribute of the chosen meta, or ``None``.
    """
    import random
    try:
        from urllib.parse import urlparse  # Python 3
    except ImportError:
        from urlparse import urlparse  # Python 2

    netloc = urlparse(url).netloc
    now = datetime.utcnow()
    lazy_queue = PriorityQueue()

    def _claim(proxy_meta):
        proxy_meta.last_used_time = now
        proxy_meta.master = netloc
        return proxy_meta.proxy

    index = 0
    while index < self.proxy_in_queue_count():
        index += 1
        proxy_url = self.get_proxy_from_queue()
        if not proxy_url:
            break
        if proxy_url not in self.proxy_meta_map:
            continue
        proxy_meta = self.proxy_meta_map[proxy_url]

        # Recently used proxies are skipped.
        # NOTE(review): the original collected these in a "busy" queue that was
        # never read; if a least-recently-used fallback was intended, add it here.
        if proxy_meta.last_used_time and \
                (now - proxy_meta.last_used_time).total_seconds() < self.settings["interval_second"]:
            continue

        if netloc in proxy_meta.latency and \
                proxy_meta.latency[netloc][0] >= self.settings["max_unavailable_count"]:
            # NOTE(review): indentation was lost in the source; the deferral is
            # assumed to apply only when the random draw says so -- confirm.
            if random.randint(1, 10) > 1:
                lazy_queue.put_nowait((proxy_meta.latency[netloc], proxy_url))
                continue

        return _claim(proxy_meta)

    # Nothing healthy was free: fall back to the deferred proxies, best first.
    while not lazy_queue.empty():
        _, proxy_url = lazy_queue.get_nowait()
        if proxy_url in self.proxy_meta_map:
            return _claim(self.proxy_meta_map[proxy_url])
    return None
def __call__(self, graph, start_node, target_node):
    """Search ``graph`` from ``start_node`` to ``target_node``.

    Every edge counts as one step.  Candidates are ordered by
    ``self.cost_function(steps_so_far, node, target_node)``.  The counters
    ``self.nodes_expanded`` and ``self.nodes_visited`` are incremented as
    the search proceeds.

    Returns the reversed result of ``find_ancestors`` for the target, or
    ``None`` when the frontier empties before the target is reached.
    """
    # NOTE(review): indentation was lost in the source; nodes_expanded is
    # assumed to be counted once per queued neighbour -- confirm.
    best_known = defaultdict(lambda: infinity)
    best_known[start_node] = 0
    parent_of = {}
    closed = set()
    frontier = PriorityQueue()

    node = start_node
    while True:
        step_cost = best_known[node] + 1
        for nxt in graph.get_neighbors(node):
            if nxt in closed or not (step_cost < best_known[nxt]):
                continue
            best_known[nxt] = step_cost
            parent_of[nxt] = node
            frontier.put((self.cost_function(step_cost, nxt, target_node), nxt))
            self.nodes_expanded += 1

        closed.add(node)
        self.nodes_visited += 1

        if node == target_node:
            return list(reversed(find_ancestors(parent_of, node, start_node)))
        try:
            node = frontier.get_nowait()[1]
        except Empty:
            return None
def consume_solution_queue(q):
    """Continuously rank incoming solutions and print the best pending one.

    Every two seconds all items currently in ``q`` are drained.  Each item is
    indexed as ``item[0]`` and ``item[1]``; items whose ``item[1]`` was seen
    before are ignored.  New ones are ranked by ``item[0] / len(item[1])``,
    highest first, and the top-ranked pending one is handed to
    ``print_solution``.  Runs until interrupted with Ctrl-C.
    """
    ranked = PriorityQueue()
    seen = set()
    try:
        while True:
            # Drain whatever the producers have delivered so far.
            while True:
                try:
                    item = q.get_nowait()
                    if item[1] not in seen:
                        ratio = item[0] / len(item[1])
                        # Negated so the highest ratio is popped first.
                        ranked.put((-1 * ratio, item[0], item[1]))
                        seen.add(item[1])
                except Empty:
                    break

            try:
                best = ranked.get_nowait()
                print_solution(best[1], best[2])
            except Empty:
                pass

            time.sleep(2)
    except KeyboardInterrupt:
        pass
def a_star(self):
    """Find a path from ``self.currPos`` to ``self.goalPos`` through the maze.

    Like BFS, but coordinates with the lowest heuristic (path length +
    Manhattan distance to the goal) are explored first.  ``'%'`` cells are
    walls and ``'.'`` marks the goal.  Progress is printed along the way.

    Returns the best path found as a list of directions, or ``None``.
    """
    frontier = PriorityQueue(maxsize=0)
    start_estimate = self.manhattan_distance(self.currPos, self.goalPos)
    frontier.put_nowait((start_estimate, (self.currPos, [])))

    seen = set()
    bestPath = None
    bestHeur = None
    numNodes = 0

    while not frontier.empty():
        priority, (coord, path) = frontier.get_nowait()
        seen.add(coord)

        # Cannot beat the best path found so far.
        if bestPath is not None and priority >= bestHeur:
            continue
        here = self.getChar(coord)
        if here == '%':  # wall
            continue

        if here == '.':  # goal
            print("Found a path: %s" % (path,))
            if bestPath is None or len(path) < len(bestPath):
                print("Is best path")
                bestPath = path[:]
                bestHeur = priority

        for adj, direction in self.adjacent(coord):
            if adj in seen or self.getChar(adj) == '%':
                continue
            numNodes += 1
            extended = path + direction
            heur = len(extended) + self.manhattan_distance(adj, self.goalPos)
            # preselect based on heuristic
            if bestPath is None or heur < bestHeur:
                frontier.put_nowait((heur, (adj, extended)))

    print("Num Nodes: %s" % numNodes)
    print(self.debug(bestPath))  # debug
    return bestPath
def a_star_penalize(self, forwardPenalty, turnPenalty):
    """A* over the maze with separate costs for moving forward and turning.

    part 1.2

    A candidate's priority is ``self.calculate_penalty(path, forwardPenalty,
    turnPenalty)`` plus the Manhattan distance to the goal scaled by
    ``forwardPenalty``.  ``'%'`` cells are walls and ``'.'`` marks the goal.

    Returns the cheapest path found as a list of directions, or ``None``.
    """
    pq = PriorityQueue(maxsize=0)
    pq.put_nowait((self.manhattan_distance(self.currPos, self.goalPos),
                   (self.currPos, [])))
    visited = set()
    bestPath = None
    bestHeur = None
    numNodes = 0

    while not pq.empty():
        priority, (coord, path) = pq.get_nowait()
        visited.add(coord)

        if bestPath is not None and priority >= bestHeur:
            continue
        if self.getChar(coord) == '%':  # wall
            continue

        if self.getChar(coord) == '.':  # goal
            print("Found a path: %s" % (path,))
            # Anything reaching this point is strictly cheaper than the best
            # so far (see the pruning test above).  bestHeur was never
            # recorded before, which left the pruning comparing against None.
            bestPath = path[:]
            bestHeur = priority

        for adj, direction in self.adjacent(coord):
            if adj not in visited and self.getChar(adj) != '%':
                numNodes += 1
                extended = path + direction
                heur = (self.calculate_penalty(extended, forwardPenalty, turnPenalty)
                        + self.manhattan_distance(adj, self.goalPos) * forwardPenalty)
                # preselect based on heuristic
                if bestPath is None or heur < bestHeur:
                    pq.put_nowait((heur, (adj, extended)))

    print("Num Nodes: %s" % numNodes)
    print(self.debug(bestPath))  # debug
    return bestPath
def greedy(self):
    """Greedy best-first search from ``self.currPos`` to the goal.

    Like DFS, but coordinates closest to the goal (Manhattan distance) are
    explored first.  ``'%'`` cells are walls and ``'.'`` marks the goal.

    Returns the first path that reaches the goal as a list of directions,
    or ``[]`` when the goal cannot be reached.
    """
    # The original tracked a ``bestPath`` that was never assigned, so every
    # guard on it was dead code; it is removed here without changing results.
    pq = PriorityQueue(maxsize=0)
    pq.put_nowait((self.manhattan_distance(self.currPos, self.goalPos),
                   (self.currPos, [])))
    visited = set()
    numNodes = 0

    while not pq.empty():
        _, (coord, path) = pq.get_nowait()
        visited.add(coord)

        here = self.getChar(coord)
        if here == '%':  # wall
            continue
        if here == '.':  # goal
            print("Num Nodes: %s" % numNodes)
            print(self.debug(path))
            return path  # return on first path found

        for adj, direction in self.adjacent(coord):
            if adj not in visited and self.getChar(adj) != '%':
                numNodes += 1
                heur = self.manhattan_distance(adj, self.goalPos)
                pq.put_nowait((heur, (adj, path + direction)))

    return []  # impossible
def a_star(graph, start, goal):
    """A* search over ``graph`` from ``start`` to ``goal``.

    Partial paths are ordered by ``path_length`` plus the graph's heuristic
    for the path's last node.  A node is extended at most once, and a path
    never revisits a node it already contains.

    Returns the path as a list of nodes, ``[start]`` when start is the goal,
    or ``[]`` when no path exists.
    """
    if start == goal:
        # list(start) would split a multi-character node name into letters.
        return [start]

    extendedSet = set()
    paths = PriorityQueue()
    for node in graph.get_connected_nodes(start):
        first_leg = [start, node]
        estimate = path_length(graph, first_leg) + graph.get_heuristic(node, goal)
        paths.put_nowait((estimate, first_leg))
    extendedSet.add(start)

    while not paths.empty():
        _, path = paths.get_nowait()
        tip = path[-1]
        if tip == goal:
            return path
        if tip in extendedSet:
            continue
        extendedSet.add(tip)
        for node in graph.get_connected_nodes(tip):
            if node not in path:
                epath = path + [node]
                estimate = path_length(graph, epath) + graph.get_heuristic(node, goal)
                paths.put_nowait((estimate, epath))
    return []
class MultiQueue(object):
    """
    Simple priority queue interface to push/pull tasks.

    Entries are stored as ``(enter_time, user, task)`` tuples, so tasks come
    out ordered by the time of the ``push`` call that added them.
    No further ordering is guaranteed.
    """

    def __init__(self):
        self.queue = PriorityQueue()

    def empty(self):
        """Return True when no task is waiting."""
        return self.queue.empty()

    def pull_nowait(self):
        """Return the next ``(user, task)`` pair without blocking.

        Propagates the queue's ``Empty`` exception when nothing is waiting.
        """
        entry = self.queue.get_nowait()
        if not entry:
            return None
        _, user, task = entry
        self.queue.task_done()
        return (user, task)

    def pull_wait(self, wait):
        """Return the next ``(user, task)`` pair, waiting up to ``wait`` seconds.

        Returns ``None`` if nothing arrived in time.
        """
        try:
            _, user, task = self.queue.get(block=True, timeout=wait)
            self.queue.task_done()
            return (user, task)
        except Empty:
            return None

    def push(self, User, Tasks):
        """Queue every task in ``Tasks`` for ``User`` under one shared timestamp."""
        stamp = time()
        for task in Tasks:
            self.queue.put_nowait((stamp, User, task))
def predict(self, image):
    """Identify the faces found in ``image``.

    Each detected bounding box is aligned and hashed.  A face whose hash is
    in ``self.trained_images`` is an exact match.  Otherwise ``self.svm``
    (when set) scores it against every known identity.  Candidates are then
    assigned greedily, most confident first, so that every identity and
    every bounding box is used at most once.

    Returns a list of ``(identity, bounding_box, probability)`` tuples with
    exactly one entry per detected face.  Unrecognised faces, and matches
    below the 0.6 threshold, are reported as ``(-1, bounding_box, 0.0)``.
    """
    result_priority_queue = PriorityQueue()
    results = []
    bbs = self.align.getAllFaceBoundingBoxes(image)

    for bb_index, bb in enumerate(bbs):
        alignedFace = self.align.alignImg("affine", 96, image, bb)
        if alignedFace is None:
            continue

        phash = str(imagehash.phash(Image.fromarray(alignedFace)))
        if phash in self.trained_images:
            # Exact match with a training image: full confidence.
            identity = self.trained_images[phash].identity
            result_priority_queue.put_nowait((-1.0, identity, bb_index))
            continue

        rep = self.net.forwardImage(alignedFace)
        if self.svm is None:
            result_priority_queue.put_nowait((0.0, -1, bb_index))
            continue

        result_proba_list = self.svm.predict_proba(rep)
        print(str(result_proba_list[0]) + " " + str(bb))
        for index, prob in enumerate(result_proba_list[0]):
            # Negated so the most probable candidate is popped first.
            result_priority_queue.put_nowait(
                (prob * -1.0, self.identities[index], bb_index))

    matched_identities = []
    matched_bb_indices = []
    threshold = 0.6
    while len(matched_identities) != len(bbs) and not result_priority_queue.empty():
        neg_probability, identity, bb_index = result_priority_queue.get_nowait()
        probability = neg_probability * -1.0

        if identity in matched_identities:
            continue
        # Without this check a face already matched would be matched again
        # with its next-best identity and show up twice in the results.
        if bb_index in matched_bb_indices:
            continue

        matched_bb_indices.append(bb_index)
        matched_identities.append(identity)
        if probability < threshold:
            results.append((-1, bbs[bb_index], 0.0))
        else:
            results.append((identity, bbs[bb_index], probability))

    # Faces that never received a candidate are reported as unknown.
    for bb_index, bb in enumerate(bbs):
        if bb_index not in matched_bb_indices:
            results.append((-1, bb, 0.0))
    return results
def _get_backfill_events(self, txn, room_id, event_list, limit):
    """Collect up to ``limit`` event ids reachable backwards from ``event_list``.

    Starting from the given events, ``prev_event_id`` edges with
    ``is_state`` false are followed within ``room_id``, always continuing
    from the deepest pending event.  Starting events whose depth lookup
    returns nothing (or 0) are ignored.

    Returns a set of event ids.
    """
    logger.debug(
        "_get_backfill_events: %s, %s, %s",
        room_id, repr(event_list), limit
    )

    found = set()

    # We want to make sure that we do a breadth-first, "depth" ordered
    # search.
    query = (
        "SELECT depth, prev_event_id FROM event_edges"
        " INNER JOIN events"
        " ON prev_event_id = events.event_id"
        " AND event_edges.room_id = events.room_id"
        " WHERE event_edges.room_id = ? AND event_edges.event_id = ?"
        " AND event_edges.is_state = ?"
        " LIMIT ?"
    )

    # Depths are negated so the deepest event is popped first.
    pending = PriorityQueue()
    for start_id in event_list:
        start_depth = self._simple_select_one_onecol_txn(
            txn,
            table="events",
            keyvalues={"event_id": start_id},
            retcol="depth",
            allow_none=True,
        )
        if start_depth:
            pending.put((-start_depth, start_id))

    while len(found) < limit:
        try:
            _, event_id = pending.get_nowait()
        except Empty:
            break

        if event_id in found:
            continue
        found.add(event_id)

        txn.execute(query, (room_id, event_id, False, limit - len(found)))
        for prev_depth, prev_id in txn.fetchall():
            if prev_id not in found:
                pending.put((-prev_depth, prev_id))

    return found
def _get_backfill_events(self, txn, room_id, event_list, limit):
    """Walk event edges backwards and gather at most ``limit`` event ids.

    The walk starts at every event in ``event_list`` that has a non-zero
    depth, and repeatedly expands the deepest pending event through its
    non-state ``prev_event_id`` edges in ``room_id``.

    Returns the set of event ids gathered.
    """
    logger.debug(
        "_get_backfill_events: %s, %s, %s",
        room_id, repr(event_list), limit
    )

    gathered = set()

    # We want to make sure that we do a breadth-first, "depth" ordered
    # search.
    query = (
        "SELECT depth, prev_event_id FROM event_edges"
        " INNER JOIN events"
        " ON prev_event_id = events.event_id"
        " AND event_edges.room_id = events.room_id"
        " WHERE event_edges.room_id = ? AND event_edges.event_id = ?"
        " AND event_edges.is_state = ?"
        " LIMIT ?"
    )

    # Keyed on negative depth: the queue pops the smallest key first.
    to_visit = PriorityQueue()
    for seed_id in event_list:
        seed_depth = self._simple_select_one_onecol_txn(
            txn,
            table="events",
            keyvalues={"event_id": seed_id},
            retcol="depth",
            allow_none=True,
        )
        if seed_depth:
            to_visit.put((-seed_depth, seed_id))

    while len(gathered) < limit:
        try:
            _, event_id = to_visit.get_nowait()
        except Empty:
            break

        if event_id in gathered:
            continue
        gathered.add(event_id)

        remaining = limit - len(gathered)
        txn.execute(query, (room_id, event_id, False, remaining))
        # The transaction object is iterated directly for the result rows.
        for row in txn:
            if row[1] not in gathered:
                to_visit.put((-row[0], row[1]))

    return gathered
def a_star(grid, start, end):
    """A* search on ``grid`` from ``start`` to ``end``.

    Cells are wrapped with ``a_cell`` and ordered by ``g + h``.  Every step
    costs 1.

    Returns ``(path, visit_ctr)``.  ``path`` lists cell values from the end
    back to the start, and ``visit_ctr`` counts the cells taken off the open
    set.  When the open set runs dry without reaching ``end`` the path is
    ``[]`` (this used to surface as the queue's ``Empty`` exception).
    """
    visit_ctr = 0
    print("Starting at {}".format(start))
    start = a_cell(start)
    end = a_cell(end)

    # Priority to sort queue on heuristic values
    openset = PriorityQueue()
    closedset = set()
    openset.put((start.h + start.g, start))

    # A PriorityQueue object is always truthy, so ``while openset:`` never
    # stopped on its own; test for emptiness explicitly.
    while not openset.empty():
        # Find the item in the open set with the lowest G + H score
        current = openset.get_nowait()[1]
        visit_ctr += 1

        if current.val == end.val:
            print("Found {}".format(current.val))
            # Follow parent links back to the start.
            path = []
            while current.parent:
                path.append(current.val)
                current = current.parent
            path.append(current.val)
            return path, visit_ctr

        # Add it to the closed set
        closedset.add(current)

        for neighbor in get_neighbors(grid, current.val):
            cell = a_cell(neighbor)
            # If it is already in the closed set, skip it
            if cell in closedset:
                continue

            if in_queue(cell, openset):
                # Already in the open set: check if we beat the G score.
                # NOTE(review): ``cell`` is a fresh a_cell, so this update may
                # not reach the instance stored in the queue -- confirm
                # against a_cell / in_queue.
                new_g = current.g + 1
                if cell.g > new_g:
                    cell.g = new_g
                    cell.parent = current
            else:
                # Not in open set, calculate the G and H score for cell
                cell.g = current.g + 1
                cell.h = heuristic(cell.val, end.val)
                cell.parent = current
                openset.put((cell.h + cell.g, cell))

    return [], visit_ctr
class DataSource(object):
    """
    Named buffer of data points for one signal.

    Points are ``(time, datum)`` pairs.  ``put`` places them in an internal
    priority queue and ``pull`` moves everything queued so far into the
    ``GraphData`` storage.  Two data sources are equal, and hash alike, when
    their names are equal.

    instance variables:
        name          - the signal name this data source tracks, e.g.
                        "0x501:Motor Current"
        queue         - internal PriorityQueue of points not yet pulled
        data          - the GraphData object that stores pulled points
        descriptor    - the optional ``desc`` given at construction
        last_received - the latest point time passed to ``put``

    method summary:
        put  - add one point to the internal queue
        pull - move all queued points into ``data``

    NOTE(review): earlier documentation also listed a ``sources`` class
    variable and ``find`` / ``push`` methods; none are defined in this class.
    """

    def __init__(self, identifier, desc=None):
        self.name = identifier
        self.queue = PriorityQueue()
        self.data = GraphData([])
        self.descriptor = desc
        # Placeholder date, replaced by the first newer point time.
        self.last_received = datetime.datetime(1993, 6, 20)

    def __hash__(self):
        return hash(self.name)

    def __eq__(self, other):
        return self.name == other.name

    def __ne__(self, other):
        # Python 2 does not derive != from ==; keep the two consistent.
        return not self.__eq__(other)

    def put(self, point):
        "Add data from the stream to the internal data queue"
        timestamp, _ = point
        self.queue.put(point)
        self.last_received = max(self.last_received, timestamp)

    def pull(self):
        "Move every point waiting in the internal queue into the data storage"
        while not self.queue.empty():
            self.data.addPoint(self.queue.get_nowait())

    def __repr__(self):
        return "DataSource(%r)" % self.name
def main():
    """Run the init and poll commands described in the properties file.

    Items come from ``ui > lights`` and ``ui > display > data`` in the JSON
    config.  Each item's ``init_cmd`` is run once.  Each item's ``poll``
    command is then re-run forever, ``interval`` milliseconds (capped at
    10000) after its previous run.  Exits immediately when nothing needs
    polling.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '-c', '--config', default='/usr/local/factory/properties.json',
        help=('Specify path to the config file, '
              'default file: /usr/local/factory/properties.json'))
    args = parser.parse_args()

    with open(args.config) as config_file:
        properties = json.load(config_file)

    try:
        lights = properties['ui']['lights']
    except Exception:
        lights = []
        logging.warning("Can't find ui > lights entry in `%s'", args.config)

    try:
        data = properties['ui']['display']['data']
    except Exception:
        data = []
        logging.warning("Can't find ui > display > data entry in `%s'", args.config)

    items = lights + data
    # Entries are (due_time, poll); the soonest due poll is popped first.
    schedule = PriorityQueue(len(items))
    for item in items:
        if 'poll' in item:
            poll = item['poll']
            poll['interval'] = min(poll.get('interval', 0), 10000)
            schedule.put((time.time(), poll))
        if 'init_cmd' in item:
            # NOTE(review): commands from the config run through a shell.
            subprocess.call(item['init_cmd'], shell=True)

    if schedule.empty():
        sys.exit(0)

    try:
        while True:
            when, poll = schedule.get_nowait()
            if time.time() < when:
                # Not due yet: put it back and sleep until it is.
                schedule.put((when, poll))
                sleep_time = when - time.time()
                if sleep_time > 0:
                    time.sleep(sleep_time)
                continue
            subprocess.call(poll['cmd'], shell=True)
            next_run = time.time() + (poll['interval'] / 1000.0)
            schedule.put((next_run, poll))
    except (KeyboardInterrupt, SystemExit):
        pass
def outlierRejection(graph, K, percent=5.0, max_dist=5.0):
    """
    Remove outlier 3D points from ``graph["3Dmatches"]`` in place.

    Points farther than ``max_dist`` from the origin are always removed.
    Of the rest, those with the largest mean reprojection error are removed
    until ``percent`` percent of all matches (counting the far-away ones)
    have been dropped.  Prints how many points were removed.
    """
    matches = graph["3Dmatches"]
    pq = PriorityQueue()
    too_far = []

    for key, entry in matches.items():
        X = entry["3Dlocs"]

        # mark and continue if too far away from the origin
        if np.linalg.norm(X) > max_dist:
            too_far.append(key)
            continue

        # project into each frame the point was observed in
        errors = []
        for frame, x in zip(entry["frames"], entry["2Dlocs"]):
            Rt = graph["motion"][frame - graph["frameOffset"]]
            diff = fromHomogenous(K * Rt * toHomogenous(X)) - x
            errors.append(np.sqrt(np.multiply(diff, diff).sum()))

        # priority is the reciprocal of the mean error since this is a MinPQ:
        # the worst points are popped first
        mean_error = np.array(errors).mean()
        pq.put_nowait((1.0 / mean_error, key))

    # remove worst keys
    quota = int((percent / 100.0) * len(matches.keys()))
    N = max(0, quota - len(too_far))
    for _ in range(N):
        _, key = pq.get_nowait()
        del matches[key]
        pq.task_done()

    # remove keys out of range
    for key in too_far:
        del matches[key]

    print("Removed %d outliers." % (N + len(too_far)))
class AsyncDatabaseManager(Thread):
    """Worker thread that owns one SQLite connection and runs queued SQL.

    Statements are executed lowest priority number first and, within one
    priority, in the order they were submitted.  The thread starts itself on
    construction and runs until ``close()`` is processed, after which
    ``self.event`` is set.
    """

    def __init__(self, directory):
        import itertools

        super(AsyncDatabaseManager, self).__init__()
        self.directory = directory
        if not os.path.exists(self.directory):
            # Create an empty database file up front.
            with open(self.directory, 'w'):
                pass
        self.queue = PriorityQueue()
        self.event = Event()
        # Submission counter: keeps equal-priority jobs in FIFO order and
        # stops the queue from comparing SQL text, arguments or result queues.
        self._sequence = itertools.count()
        self.start()  # Threading module start

    def run(self):
        super(AsyncDatabaseManager, self).run()
        db = sqlite3.connect(self.directory)
        cursor = db.cursor()
        while True:
            # Block until a job arrives instead of polling with sleeps.
            _priority, _seq, sql, arg, res = self.queue.get()
            if sql == '__close__':
                break
            # execute() rejects None as its parameters argument.
            cursor.execute(sql, () if arg is None else arg)
            time.sleep(0.01)
            db.commit()
            if res is not None:
                for rec in cursor:
                    res.put(rec)
                res.put('__last__')  # end-of-results marker for select()
        db.close()
        self.event.set()

    # TODO: Question: Do I want the database to finish or end it when the app ends?
    def execute(self, sql, args=None, res=None, priority=2):
        """Queue ``sql`` for execution; result rows are put on ``res`` if given."""
        self.queue.put_nowait((priority, next(self._sequence), sql, args, res))

    def select(self, sql, args=None, priority=2):
        '''
        Queue a query and yield its rows as they arrive.

        :param: sql - command to execute
        :param: args - sql arguments
        :param: priority - 2 for system and 1 for user
        '''
        res = Queue()
        self.execute(sql, args, res, priority)
        while True:
            rec = res.get()
            if rec == '__last__':
                break
            yield rec

    def close(self):
        """Ask the worker to stop once the jobs ahead of this request are done."""
        self.execute('__close__')
class PrioritySet(object):
    """Priority queue of model instances that holds each instance at most once.

    Objects are tracked as ``(class, pk)`` pairs.  ``pop`` loads the object
    back through ``klass.objects.get(pk=pk)``.
    """

    def __init__(self):
        self.lock = RLock()
        self.set_ = set()
        self.queue = PriorityQueue()

    def __len__(self):
        with self.lock:
            return min(self.queue.qsize(), len(self.set_))

    def __iter__(self):
        # Iterate over a snapshot so callers do not need to hold the lock.
        with self.lock:
            return iter(list(self.set_))

    def add(self, priority, obj):
        """Queue ``obj`` at ``priority``; return True unless already queued."""
        item = (obj.__class__, obj.pk)
        LOG.debug('%s objects in queue'%len(self))
        with self.lock:
            if item not in self.set_:
                self.queue.put((priority, item))
                self.set_.add(item)
                return True

    def pop(self):
        """Return the queued object with the lowest priority number.

        An entry whose object no longer exists is skipped; while its
        priority is below 100 it is queued again with priority + 10.

        Raises KeyError when the set is empty.
        """
        LOG.debug("Trying to pop from queue")
        while 1:
            try:
                with self.lock:
                    priority, item = self.queue.get_nowait()
                    self.set_.remove(item)
            except Empty:
                LOG.debug("queue is empty")
                raise KeyError("pop from an empty set")
            except KeyError:
                LOG.error("item not in self.set_")
                continue
            except Exception:
                # Was a bare ``except:``, which also trapped KeyboardInterrupt
                # and SystemExit inside this endless loop.
                LOG.exception("An error occured while getting an item in queue")
            else:
                klass, pk = item
                try:
                    return klass.objects.get(pk=pk)
                except klass.DoesNotExist:
                    LOG.warn("%s don't exist anymore"%((priority, item),))
                    if priority < 100:
                        with self.lock:
                            self.queue.put((priority+10, item))
                            self.set_.add(item)
                    continue
def getAlphaBeta(game, player, other, reward=None, depth=10, tab=0):
    '''
    Look ``depth`` plies ahead and return the best move for ``player``.

    alpha: minimum bound of the outcome -- currently, evaluate of best move possible by other
    beta: maximum bound of the outcome -- currently, evaluate of best move possible by player
    reward: AlphaBeta bounds for this position; when omitted a new
        ``AlphaBeta(-100, 100)`` is used.  Its ``evaluation`` is overwritten.
    returns: an AlphaBetaOfMove -- (alpha, beta, evaluation, move)

    Raises AssertionError when the game refuses a move or refuses to undo it.
    '''
    if reward is None:
        # Built per call: an instance created in the signature would be
        # shared and mutated by every call relying on the default.
        reward = AlphaBeta(-100, 100)

    # evaluation is the current value of board, assuming no more moves in future
    # alpha == beta == finalvalue if we figure out the outcome.
    reward.evaluation = game.evaluate_cached(player)
    move = Move(-1, -1)

    # base case: evaluation has hit the upper bound, nothing left to decide
    if reward.evaluation == reward.beta:
        return AlphaBetaOfMove(
            AlphaBeta(reward.beta, reward.beta, reward.beta), move)

    # base case: can't evaluate further
    if depth == 0:
        return AlphaBetaOfMove(reward, move)

    candidates = PriorityQueue()
    for m in game.next_moves():
        # Not written as ``assert game.move(...)``: asserts are removed under
        # ``python -O`` and the move itself would then never be played.
        if not game.move(player, m):
            raise AssertionError("game rejected move %r" % (m,))
        try:
            # Negate and swap the bounds to view them from the opponent's side.
            oponent_reward = AlphaBeta(-reward.beta, -reward.alpha,
                                       -reward.evaluation)
            oponent_reward = getAlphaBeta(game,
                                          other,
                                          player,
                                          reward=oponent_reward,
                                          depth=depth - 1,
                                          tab=tab + 1).alphabeta
            player_reward = AlphaBeta(-oponent_reward.beta,
                                      -oponent_reward.alpha,
                                      -oponent_reward.evaluation)
            candidates.put_nowait(AlphaBetaOfMove(player_reward, m))
        finally:
            if not game.unmove(player, m):
                raise AssertionError("game could not undo move %r" % (m,))

    if candidates.empty():
        return AlphaBetaOfMove(reward, Move(-1, -1))
    # Smallest AlphaBetaOfMove, by that type's own ordering, is the result.
    return candidates.get_nowait()
def outlierRejection(graph, K, percent=5.0, max_dist=5.0):
    """
    Examine graph and delete outlying entries of ``graph["3Dmatches"]``.

    Two kinds of points are deleted in place: every point whose norm exceeds
    ``max_dist``, and then the points with the highest mean reprojection
    error, until ``percent`` percent of all matches are gone (the far-away
    points count towards that quota).  A summary line is printed.
    """
    points = graph["3Dmatches"]
    frame_offset = graph["frameOffset"]

    by_error = PriorityQueue()
    out_of_range = []

    # iterate through all points
    for key, entry in points.items():
        location = entry["3Dlocs"]

        if np.linalg.norm(location) > max_dist:
            # too far away from the origin: always dropped
            out_of_range.append(key)
            continue

        # reprojection error in every frame that observed the point
        per_frame = []
        for frame, observed in zip(entry["frames"], entry["2Dlocs"]):
            pose = graph["motion"][frame - frame_offset]
            projected = fromHomogenous(K * pose * toHomogenous(location))
            delta = projected - observed
            per_frame.append(np.sqrt(np.multiply(delta, delta).sum()))

        # MinPQ keyed on 1 / error, so the largest errors come out first
        mean_error = np.array(per_frame).mean()
        by_error.put_nowait((1.0 / mean_error, key))

    # remove worst keys
    N = max(0,
            int((percent / 100.0) * len(points.keys())) - len(out_of_range))
    removed = 0
    while removed < N:
        _, worst = by_error.get_nowait()
        del points[worst]
        by_error.task_done()
        removed += 1

    # remove keys out of range
    for key in out_of_range:
        del points[key]

    print("Removed %d outliers." % (N + len(out_of_range)))
def ShortestPath(startNode, destinationNodes):
    """Find the cheapest path from ``startNode`` to any destination node.

    Dijkstra w/ priority queue over node weights: a path costs the sum of
    the ``weight`` of every node on it, the start node included.  Nodes
    expose ``weight`` and ``edges`` (their neighbours).

    Returns ``(bestCost, path)`` where ``path`` lists the nodes from
    ``startNode`` to the cheapest destination.  With equal costs the
    destination listed first wins.  An unreachable destination has cost
    999999999999.  Raises IndexError when ``destinationNodes`` is empty.
    """
    import itertools

    Infinity = 999999999999
    distance = defaultdict(lambda: Infinity)
    predecessor = defaultdict(lambda: None)
    queued = defaultdict(lambda: False)

    # The ticket breaks priority ties so node objects are never compared.
    ticket = itertools.count()
    nextNodes = PriorityQueue()
    nextNodes.put_nowait((0, next(ticket), startNode))
    queued[startNode] = True
    distance[startNode] = startNode.weight

    while not nextNodes.empty():
        _, _, node = nextNodes.get_nowait()
        queued[node] = False
        for neighbor in node.edges:
            alternate = distance[node] + neighbor.weight
            if alternate < distance[neighbor]:
                distance[neighbor] = alternate
                predecessor[neighbor] = node
                if not queued[neighbor]:
                    nextNodes.put_nowait((alternate, next(ticket), neighbor))
                    queued[neighbor] = True

    # Sort on cost only; the stable sort keeps the caller's order on ties.
    destinationDistances = [
        (distance[node], node) for node in destinationNodes]
    bestCost, bestDestination = sorted(
        destinationDistances, key=lambda pair: pair[0])[0]

    # For the best destination node, construct the path taken to get there.
    path = [bestDestination]
    node = predecessor[bestDestination]
    while node is not None:
        path.append(node)
        node = predecessor[node]
    path.reverse()
    return bestCost, path
def ShortestPath(startNode, endNode):
    """Find the cheapest path from ``startNode`` to ``endNode``.

    Dijkstra w/ priority queue over node weights: a path costs the sum of
    the ``weight`` of every node on it, the start node included.  Nodes
    expose ``weight`` and ``edges`` (their neighbours).

    Returns ``(cost, path)`` with ``path`` listing the nodes from
    ``startNode`` to ``endNode``.  When ``endNode`` is unreachable the cost
    is 999999999999 and the path is just ``[endNode]``.
    """
    import itertools

    Infinity = 999999999999
    distance = defaultdict(lambda: Infinity)
    predecessor = defaultdict(lambda: None)
    queued = defaultdict(lambda: False)

    # Entries carry a running ticket so that equal priorities fall back to
    # insertion order instead of comparing node objects.
    ticket = itertools.count()
    nextNodes = PriorityQueue()
    nextNodes.put_nowait((0, next(ticket), startNode))
    queued[startNode] = True
    distance[startNode] = startNode.weight

    while not nextNodes.empty():
        _, _, node = nextNodes.get_nowait()
        queued[node] = False
        for neighbor in node.edges:
            alternate = distance[node] + neighbor.weight
            if alternate < distance[neighbor]:
                distance[neighbor] = alternate
                predecessor[neighbor] = node
                if not queued[neighbor]:
                    nextNodes.put_nowait((alternate, next(ticket), neighbor))
                    queued[neighbor] = True

    # Dijkstra done here. Now we process the results.
    cost = distance[endNode]

    # Construct the path taken to get there by walking predecessors back.
    path = [endNode]
    node = predecessor[endNode]
    while node is not None:
        path.append(node)
        node = predecessor[node]
    path.reverse()
    return cost, path
def distance(self, i, j):
    """Return the distance from pixel ``(i, j)`` to the nearest Point.

    The pixel's position is ``Point(pixel_lats[j], pixel_lons[i])``.  The
    search starts at ``self.grid`` and always expands the closest element
    found so far, queueing its ``children``, until the closest element is
    itself a ``Point``.
    """
    lon = self.pixel_lons[i]
    lat = self.pixel_lats[j]
    target = Point(lat, lon)

    candidates = PriorityQueue()
    candidates.put_nowait((self.grid.distance(target), self.grid))

    # We iterate over the priority queue until the nearest element is a
    # point. While it isn't we add its children to the queue.
    while True:
        nearest_distance, nearest = candidates.get_nowait()
        if isinstance(nearest, Point):
            return nearest_distance
        for child in nearest.children:
            candidates.put_nowait((child.distance(target), child))
class JobQueue(object):
    """Queue that hands out the job with the highest priority metric first."""

    def __init__(self):
        self._priorityQueue = PriorityQueue()

    def put(self, job, priority_metric):
        """Queue ``job``; a larger ``priority_metric`` is served earlier."""
        # PriorityQueue pops the smallest key, so the metric is negated.
        self._priorityQueue.put_nowait((-priority_metric, job))

    def get(self):
        """Return the next job, or ``None`` when none can be retrieved."""
        try:
            return self._priorityQueue.get_nowait()[1]
        except Exception:
            # Deliberately never raises: an empty queue reads as "no job".
            return None

    def __iter__(self):
        """Yield queued jobs, highest priority first, until ``get`` gives None.

        The terminating ``None`` is not yielded, and a falsy job (such as 0)
        does not end the iteration early.
        """
        while True:
            job = self.get()
            if job is None:
                return
            yield job
def UCS(self):
    """Uniform-cost search from ``self.problem.tabuleiro`` to a goal state.

    Every move costs 1.  ``self.num_visited`` counts the states taken off
    the queue and ``self.max_mem`` records the largest queue size observed.

    Returns the moves leading to the goal concatenated into one value, or
    ``None`` when the goal cannot be reached.
    """
    queue = PriorityQueue()
    queue.put([0, self.problem.tabuleiro, ''])
    visitados = []

    # ``while queue:`` was always true (a PriorityQueue is never falsy), so
    # an exhausted search ended in the queue's Empty exception; test for
    # emptiness like ASTAR does.
    while not queue.empty():
        self.num_visited += 1
        if self.max_mem < queue.qsize():
            self.max_mem = queue.qsize()

        custo, node, caminho = queue.get_nowait()
        visitados.append(node)
        if self.problem.testeObjetivo(node):
            return caminho

        for suc, move in self.problem.sucessores(node):
            if suc not in visitados:
                queue.put([custo + 1, suc, caminho + move])
    return None
class MaxQueue(object):
    '''
    A priority queue sorted in descending order instead of ascending order.

    Items are ``(value, payload)`` pairs and the pair with the highest value
    is returned first.  Internally the value is negated and stored in a
    regular PriorityQueue.

    ``maxlength`` and ``length_tol`` are stored on the instance but no length
    limit is enforced by this class.
    '''

    def __init__(self, maxlength = 0, length_tol = 1000):
        self.length_tol = length_tol
        self.maxlength = maxlength
        self.decrease_length = multiprocessing.Lock()
        self.pq = PriorityQueue()

    @staticmethod
    def _flip(pair):
        # Negate the value, keep the payload; applied on the way in and out.
        value, payload = pair[0], pair[1]
        return (-value, payload)

    def put(self,tup):
        # add new item to queue
        self.pq.put(self._flip(tup))

    def get(self,block=True):
        return self._flip(self.pq.get(block))

    def get_nowait(self):
        return self._flip(self.pq.get_nowait())

    def __len__(self):
        return self.pq.qsize()

    def empty(self):
        return self.pq.empty()
class MaxQueue(object):
    '''
    Max-first wrapper around PriorityQueue.

    ``put`` takes ``(value, payload)`` pairs; ``get`` and ``get_nowait``
    return the pair with the largest value.  Values are stored negated, so
    the underlying min-first queue pops them in descending order.

    The ``maxlength`` and ``length_tol`` settings are only recorded; the
    queue is not trimmed by any method of this class.
    '''

    def __init__(self, maxlength=0, length_tol=1000):
        self.pq = PriorityQueue()
        self.decrease_length = multiprocessing.Lock()
        self.maxlength, self.length_tol = maxlength, length_tol

    def put(self, tup):
        # add new item to queue, keyed on the negated value
        value, payload = tup[0], tup[1]
        self.pq.put((-value, payload))

    def get(self, block=True):
        negated, payload = self.pq.get(block)
        return (-negated, payload)

    def get_nowait(self):
        negated, payload = self.pq.get_nowait()
        return (-negated, payload)

    def __len__(self):
        return self.pq.qsize()

    def empty(self):
        return self.pq.empty()
def branch_and_bound(graph, start, goal):
    """Search ``graph`` for a path from ``start`` to ``goal``, cheapest first.

    Partial paths are kept in a priority queue keyed by ``path_length`` and
    the cheapest one is always extended next; a node is never added to a
    path that already contains it.

    :param graph: object providing ``get_connected_nodes(node)``
    :param start: node to start from
    :param goal: node to reach
    :returns: the list of nodes of the first path popped that ends at
        ``goal``; ``[start]`` when ``start == goal``; ``[]`` when the queue
        runs dry without reaching ``goal``.
    """
    if start == goal:
        # A path made of the single start node.  list(start) would split a
        # multi-character node name into its characters and would fail for
        # nodes that are not iterable.
        return [start]

    paths = PriorityQueue()
    for node in graph.get_connected_nodes(start):
        first_leg = [start, node]
        paths.put_nowait((path_length(graph, first_leg), first_leg))

    while not paths.empty():
        _length, path = paths.get_nowait()
        tail = path[-1]
        if tail == goal:
            return path
        for node in graph.get_connected_nodes(tail):
            if node not in path:  # never revisit a node on the same path
                extended = path + [node]
                paths.put_nowait((path_length(graph, extended), extended))
    return []
def getTwoBestClusters(self):
    """Pick the two clusters that come first when ranked by average mass.

    Every cluster in ``self.clusters`` is ranked by
    ``total_mass / size`` in a PriorityQueue, which hands back the
    smallest averages first.

    Returns ``(centroids, scores)``: the two selected centroids and their
    ``total_mass`` values, in the same order.  With fewer than two
    clusters a warning is printed and ``([], [])`` is returned.
    """
    ranked = PriorityQueue()
    found = 0
    for centroid, stats in self.clusters.iteritems():
        ranked.put_nowait((stats["total_mass"] / stats["size"], centroid))
        found += 1

    if found < 2:
        print("Warning. Only found %d clusters." % found)
        return [], []

    centroids = []
    for _ in range(2):
        _avg_mass, centroid = ranked.get_nowait()
        ranked.task_done()
        centroids.append(centroid)

    scores = [self.clusters[chosen]["total_mass"] for chosen in centroids]
    return centroids, scores
class PriorityLock(object):
    """Lock whose waiters are woken in priority order.

    A free lock is taken immediately.  Otherwise the caller parks on a
    private ``Condition`` that is queued under its priority; ``release``
    wakes the waiter with the smallest priority value, or marks the lock
    free when nobody is waiting.  Usable as a context manager.
    """

    def __init__(self):
        self._mutex = Lock()
        self._waiter_queue = PriorityQueue()
        self._is_available = True

    def acquire(self, priority=0):
        """Take the lock, blocking until it is handed over. Returns True."""
        with self._mutex:
            # First, just check the lock.
            if self._is_available:
                self._is_available = False
                return True
            # Hold the condition before it becomes visible to release(), so
            # a notify cannot happen before this thread is inside wait().
            wakeup = Condition()
            wakeup.acquire()
            self._waiter_queue.put((priority, wakeup))
        wakeup.wait()
        wakeup.release()
        return True

    def release(self):
        """Hand the lock to the next waiter, or mark it free."""
        with self._mutex:
            # Notify the next thread in line, if any.
            try:
                _, wakeup = self._waiter_queue.get_nowait()
            except Empty:
                self._is_available = True
                return
            with wakeup:
                wakeup.notify()

    def __enter__(self):
        self.acquire()
        return self

    def __exit__(self, type, value, traceback):
        self.release()
def ASTAR(self, heuristic):
    """A* search from ``self.problem.tabuleiro`` to a goal state.

    Every move costs 1.  Queue entries are ``[f, node, path, g]`` where
    ``g`` is the number of moves made so far and ``f = g + h(node)``.
    The path cost ``g`` is carried separately from the priority: reusing
    the popped priority as the cost would add up the heuristic of every
    ancestor and could make a longer path look cheaper than a shorter one.

    :param heuristic: ``1`` selects ``self.heuristic_manhattan``; any other
        value selects ``self.heuristic_full_manhattan``.
    :returns: the concatenated moves leading to the first goal state
        popped, or ``None`` when the queue runs empty.

    Side effects: ``self.num_visited`` is incremented once per popped
    entry and ``self.max_mem`` records the largest queue size seen.
    """
    if heuristic == 1:
        estimate = self.heuristic_manhattan
    else:
        estimate = self.heuristic_full_manhattan

    queue = PriorityQueue()
    queue.put([0, self.problem.tabuleiro, '', 0])
    # Kept as a list: states are only compared with ==, so they need not
    # be hashable.
    visitados = []
    while not queue.empty():
        self.num_visited += 1
        if self.max_mem < queue.qsize():
            self.max_mem = queue.qsize()
        _f, node, caminho, custo = queue.get_nowait()
        visitados.append(node)
        if self.problem.testeObjetivo(node):
            return caminho
        for suc, move in self.problem.sucessores(node):
            if suc not in visitados:
                passos = custo + 1  # g of the successor
                queue.put([estimate(suc) + passos, suc, caminho + move, passos])
    return None
def build_dict(self):
    """Build ``self.token_dict`` from the tokens found in ``self.input_file``.

    Each line is lower-cased and tokenized with ``re.findall(expression, ...)``.
    The four special tokens get indices 0-3; every other token is numbered
    from 4 upwards in order of decreasing count (ties broken by the token
    itself, via the ``[-count, token]`` queue entries).
    """
    from Queue import PriorityQueue

    counts = {}
    with open(self.input_file) as f:
        print("Creating Dictionary...")
        n_lines = 0
        for line in f:
            for token in re.findall(expression, line.lower()):
                counts[token] = counts.get(token, 0) + 1
            n_lines += 1
        print("Lines in the Dataset: " + str(n_lines))

    # Negated counts make the min-first queue yield the most frequent first.
    by_frequency = PriorityQueue()
    for token in counts:
        by_frequency.put([-counts[token], token])

    # add special token
    self.token_dict = {}
    for index, special in enumerate(
            (zero_token, unknown_token, start_token, end_token)):
        self.token_dict[special] = index

    counts = {}  # the raw counts are not needed any more
    token_index = 4
    # priority queue
    while not by_frequency.empty():
        entry = by_frequency.get_nowait()
        self.token_dict[entry[1]] = token_index
        token_index += 1
class TaskChain(object):
    """First-in, first-out chain of tasks stored in a ``PriorityQueue``.

    Each task is queued under an ever-increasing sequence number, so tasks
    come back in the order they were put and the task objects themselves
    are never compared.  ``task_num`` is the number of tasks currently
    queued; it is not used as the priority, because it shrinks on ``get``
    and would hand the same priority to two different tasks.
    """

    def __init__(self):
        self.task_chain = PriorityQueue()
        self.task_num = 0    # tasks currently in the chain
        self._next_seq = 0   # insertion counter; only ever grows

    def put(self, task):
        """Append ``task`` to the chain.

        Exceptions raised by ``put_nowait`` propagate unchanged.
        """
        self.task_chain.put_nowait((self._next_seq, task))
        self._next_seq += 1
        self.task_num += 1

    def get(self):
        """Remove and return the oldest task.

        Exceptions raised by ``get_nowait`` (an empty chain) propagate
        unchanged.
        """
        _seq, task = self.task_chain.get_nowait()
        self.task_num -= 1
        return task

    def size(self):
        """Return the number of tasks currently in the chain."""
        return self.task_num
def a_star_ghost(self):
    """Best-first search from ``self.currPos`` to a '.' cell, avoiding the ghost.

    Queue entries are ``(priority, (coord, path))``.  The priority of a
    step is the length of the path so far plus the Manhattan distance to
    ``self.goalPos``, plus a penalty when the cell was already visited.
    Steps that would land on the ghost, or swap places with it, are not
    queued.  The search keeps going after the first hit and keeps the
    shortest path found; entries whose priority cannot beat it are dropped.

    Prints progress information and returns the best path found, or
    ``None``.
    """
    frontier = PriorityQueue(maxsize=0)
    frontier.put_nowait(
        (self.manhattan_distance(self.currPos, self.goalPos),
         (self.currPos, [])))
    visited = set()
    bestPath = None
    bestHeur = None
    numNodes = 0
    # backwards penalty to allow loitering
    backwardsPenalty = len(self.maze) * len(self.maze[0]) / 2

    while not frontier.empty():
        priority, (coord, path) = frontier.get_nowait()
        visited.add(coord)

        if bestPath is not None and priority >= bestHeur:
            continue  # cannot improve on the best path found so far
        if self.getChar(coord) == '%':  # wall
            continue

        # recursive case
        if self.getChar(coord) == '.':  # goal
            print("Found a path:" + " " + str(path))
            if bestPath is None or len(path) < len(bestPath):
                print("Is best path")
                bestPath = path[:]
                bestHeur = priority

        for adj, direction in self.adjacent(coord):
            if self.getChar(adj) == '%':
                continue
            numNodes += 1
            nextPath = path + direction
            heur = len(nextPath) + self.manhattan_distance(adj, self.goalPos)
            if adj in visited:
                heur += backwardsPenalty
            if bestPath is None or heur < bestHeur:  # preselect based on heuristic
                # check that next step won't put pacman on same square as
                # ghost, or won't cross paths with ghost
                if (adj != self.getGhostPos(nextPath)
                        and adj != self.getGhostPos(path)
                        and coord != self.getGhostPos(nextPath)):
                    frontier.put_nowait((heur, (adj, nextPath)))

    print("Num Nodes:" + " " + str(numNodes))
    print(self.debug(bestPath))  # debug
    return bestPath
def find_feasible_schedule(para, rtn):
    """Find a feasible schedule with a greedy pass over the stages.

    Heats are pushed through the stages at positions 0, 2 and 4 of
    ``para.heat_sequence``; within a stage the heats are taken in order of
    their ready time and handed to the stage's units round-robin.  Groups
    are then placed on whichever caster has the smallest accumulated time.

    Returns a list indexed by task number (tasks count from 1).  Entry 0 is
    the placeholder ``(-100, -100)``; a task that received a start time
    ``t`` maps to ``(t, t + 1)``; every other task keeps ``(-1, -1)``.
    """
    # todo: do not work for G6
    # todo or, expand the time horizon and assign a bigM price for t>96
    # todo the following is simply a greedy alg
    # todo offset to the most cheapst hours
    # todo need test for 3EAF,2AOD,4LF,1CC
    rtn_casters = rtn.steel_rtn.casters
    # Start time per task; index 0 is unused, -1 means "not scheduled".
    task_start = [-100] + [-1] * para.num_tasks
    # Entries are (ready time, heat index); refilled by every stage.
    heat_ready_q2 = PriorityQueue()
    # get caster start asap by arranging heat priority
    # group_time_q = PriorityQueue()
    # for group_ in range(object.num_groups):
    #     total_slot = 0
    #     for heat in object.optmath.steel_rtn.group2heats[group_+1]:
    #         for s in object.process_sequence:
    #             total_slot += object.optmath.steel_rtn.task_length[object.optmath.steel_rtn.tasks[s][heat-1]]
    #     total_slot += max([object.optmath.steel_rtn.task_cleanup_length[object.optmath.steel_rtn.tasks[unit][group_]] for unit in casters])
    #     group_time_q.put_nowait((total_slot,group_+1))
    # equip_count = 0
    # num_eaf = object.optmath.steel_rtn.stage2units['1']['EAF']
    # while not group_time_q.empty():
    #     (total_slot, group) = group_time_q.get_nowait()
    #     for heat in object.optmath.steel_rtn.group2heats[group]:
    #         heat_ready_q2.put_nowait((math.floor(equip_count/num_eaf),heat-1))
    #         equip_count += 1
    # Every heat is ready at time 0 for the first stage.
    for heat_ in range(para.num_heats):
        heat_ready_q2.put_nowait((0, heat_))
    for seq in [0, 2, 4]:
        # The previous stage's output queue becomes this stage's input.
        heat_ready_q = heat_ready_q2
        heat_ready_q2 = PriorityQueue()
        task_type = para.heat_sequence[seq]
        # Time at which each unit of this stage becomes free.
        equip_time = [0] * para.unit2num[task_type]
        equip_id = 0
        while not heat_ready_q.empty():
            (ready_t, heat_) = heat_ready_q.get_nowait()
            task = rtn.steel_rtn.tasks[task_type][heat_]
            # The task starts once both the unit and the heat are ready.
            equip_time[equip_id] = max(equip_time[equip_id], ready_t)
            task_start[task] = equip_time[equip_id]
            equip_time[equip_id] += rtn.steel_rtn.task_length[task]
            # NOTE(review): task + para.num_heats presumably indexes the
            # transfer task that follows `task` -- confirm against the
            # task numbering.
            trans_time = rtn.steel_rtn.task_length[task + para.num_heats]
            heat_ready_q2.put_nowait((equip_time[equip_id] + trans_time, heat_))
            # Next heat goes to the next unit, wrapping around.
            equip_id = (equip_id + 1) % len(equip_time)
    # casting
    # Ready time per heat after the last stage above.
    heat_ready_list = [-1] * para.num_heats
    while not heat_ready_q2.empty():
        (read_t, heat_) = heat_ready_q2.get_nowait()
        heat_ready_list[heat_] = read_t
    group_ready_t = dict()
    caster_time = [0] * len(rtn_casters)
    for caster in rtn_casters:
        group_ready_t[caster] = PriorityQueue()
    # group ready time
    for group_ in range(para.num_groups):
        heats = rtn.steel_rtn.group2heats[group_ + 1]
        for caster in rtn_casters:
            # Heat numbers in group2heats are 1-based; heat_ready_list is
            # 0-based.  A group is ready when its latest heat is.
            read_t = [heat_ready_list[heat - 1] - rtn.steel_rtn.cast_heat_rel_slot[caster][heat] for heat in heats]
            group_ready_t[caster].put_nowait((max(read_t), group_))
    scheduled_groups = []
    while len(scheduled_groups) < para.num_groups:
        # Serve the caster with the smallest accumulated time.
        caster_id = np.argmin(caster_time)
        (read_t, group_) = group_ready_t[rtn_casters[caster_id]].get_nowait()
        # Every caster queue holds every group; skip the ones already
        # scheduled on another caster.
        while group_ in scheduled_groups:
            (read_t, group_) = group_ready_t[rtn_casters[caster_id]].get_nowait()
        scheduled_groups.append(group_)
        schedule_time = max(read_t, caster_time[caster_id])
        schedule_task = rtn.steel_rtn.tasks[rtn_casters[caster_id]][group_]
        caster_time[caster_id] = schedule_time + rtn.steel_rtn.task_cleanup_length[schedule_task]
        task_start[schedule_task] = schedule_time
    task_time = [(-100, -100)] + [(-1, -1)] * rtn.steel_rtn.num_tasks
    # task counts from 1
    for task in range(1, para.num_tasks + 1):
        if task_start[task] < 0:
            continue
        task_time[task] = (task_start[task], task_start[task] + 1)
    return task_time
def find_feasible_schedule(para, rtn):
    """Find a feasible schedule with a greedy pass over the stages.

    Heats are pushed through the stages at positions 0, 2 and 4 of
    ``para.heat_sequence``; within a stage the heats are taken in order of
    their ready time and handed to the stage's units round-robin.  Groups
    are then placed on whichever caster has the smallest accumulated time.

    Returns a list indexed by task number (tasks count from 1).  Entry 0 is
    the placeholder ``(-100, -100)``; a task that received a start time
    ``t`` maps to ``(t, t + 1)``; every other task keeps ``(-1, -1)``.
    """
    # todo: do not work for G6
    # todo or, expand the time horizon and assign a bigM price for t>96
    # todo the following is simply a greedy alg
    # todo offset to the most cheapst hours
    # todo need test for 3EAF,2AOD,4LF,1CC
    rtn_casters = rtn.steel_rtn.casters
    # Start time per task; index 0 is unused, -1 means "not scheduled".
    task_start = [-100] + [-1] * para.num_tasks
    # Entries are (ready time, heat index); refilled by every stage.
    heat_ready_q2 = PriorityQueue()
    # get caster start asap by arranging heat priority
    # group_time_q = PriorityQueue()
    # for group_ in range(object.num_groups):
    #     total_slot = 0
    #     for heat in object.optmath.steel_rtn.group2heats[group_+1]:
    #         for s in object.process_sequence:
    #             total_slot += object.optmath.steel_rtn.task_length[object.optmath.steel_rtn.tasks[s][heat-1]]
    #     total_slot += max([object.optmath.steel_rtn.task_cleanup_length[object.optmath.steel_rtn.tasks[unit][group_]] for unit in casters])
    #     group_time_q.put_nowait((total_slot,group_+1))
    # equip_count = 0
    # num_eaf = object.optmath.steel_rtn.stage2units['1']['EAF']
    # while not group_time_q.empty():
    #     (total_slot, group) = group_time_q.get_nowait()
    #     for heat in object.optmath.steel_rtn.group2heats[group]:
    #         heat_ready_q2.put_nowait((math.floor(equip_count/num_eaf),heat-1))
    #         equip_count += 1
    # Every heat is ready at time 0 for the first stage.
    for heat_ in range(para.num_heats):
        heat_ready_q2.put_nowait((0, heat_))
    for seq in [0, 2, 4]:
        # The previous stage's output queue becomes this stage's input.
        heat_ready_q = heat_ready_q2
        heat_ready_q2 = PriorityQueue()
        task_type = para.heat_sequence[seq]
        # Time at which each unit of this stage becomes free.
        equip_time = [0] * para.unit2num[task_type]
        equip_id = 0
        while not heat_ready_q.empty():
            (ready_t, heat_) = heat_ready_q.get_nowait()
            task = rtn.steel_rtn.tasks[task_type][heat_]
            # The task starts once both the unit and the heat are ready.
            equip_time[equip_id] = max(equip_time[equip_id], ready_t)
            task_start[task] = equip_time[equip_id]
            equip_time[equip_id] += rtn.steel_rtn.task_length[task]
            # NOTE(review): task + para.num_heats presumably indexes the
            # transfer task that follows `task` -- confirm against the
            # task numbering.
            trans_time = rtn.steel_rtn.task_length[task + para.num_heats]
            heat_ready_q2.put_nowait(
                (equip_time[equip_id] + trans_time, heat_))
            # Next heat goes to the next unit, wrapping around.
            equip_id = (equip_id + 1) % len(equip_time)
    # casting
    # Ready time per heat after the last stage above.
    heat_ready_list = [-1] * para.num_heats
    while not heat_ready_q2.empty():
        (read_t, heat_) = heat_ready_q2.get_nowait()
        heat_ready_list[heat_] = read_t
    group_ready_t = dict()
    caster_time = [0] * len(rtn_casters)
    for caster in rtn_casters:
        group_ready_t[caster] = PriorityQueue()
    # group ready time
    for group_ in range(para.num_groups):
        heats = rtn.steel_rtn.group2heats[group_ + 1]
        for caster in rtn_casters:
            # Heat numbers in group2heats are 1-based; heat_ready_list is
            # 0-based.  A group is ready when its latest heat is.
            read_t = [
                heat_ready_list[heat - 1] -
                rtn.steel_rtn.cast_heat_rel_slot[caster][heat]
                for heat in heats
            ]
            group_ready_t[caster].put_nowait((max(read_t), group_))
    scheduled_groups = []
    while len(scheduled_groups) < para.num_groups:
        # Serve the caster with the smallest accumulated time.
        caster_id = np.argmin(caster_time)
        (read_t,
         group_) = group_ready_t[rtn_casters[caster_id]].get_nowait()
        # Every caster queue holds every group; skip the ones already
        # scheduled on another caster.
        while group_ in scheduled_groups:
            (read_t,
             group_) = group_ready_t[rtn_casters[caster_id]].get_nowait()
        scheduled_groups.append(group_)
        schedule_time = max(read_t, caster_time[caster_id])
        schedule_task = rtn.steel_rtn.tasks[rtn_casters[caster_id]][group_]
        caster_time[
            caster_id] = schedule_time + rtn.steel_rtn.task_cleanup_length[
                schedule_task]
        task_start[schedule_task] = schedule_time
    task_time = [
        (-100, -100)
    ] + [(-1, -1)] * rtn.steel_rtn.num_tasks
    # task counts from 1
    for task in range(1, para.num_tasks + 1):
        if task_start[task] < 0:
            continue
        task_time[task] = (task_start[task], task_start[task] + 1)
    return task_time
from Queue import PriorityQueue
from fractions import Fraction

# Explore pairs of unit fractions (1/a, 1/b), starting from (1/2, 1/2) and
# always taking the pair whose sum has the smallest denominator next.
# nums counts, per denominator of the sum, how many distinct pairs were
# seen; the search stops as soon as one denominator has been hit more than
# five times.
nums = {}
seen = set()
pq = PriorityQueue()

frac = Fraction(1, 2), Fraction(1, 2)
pq.put_nowait(((frac[0] + frac[1]).denominator, (frac[0], frac[1])))

while not pq.empty():
    weight, (frac1, frac2) = pq.get_nowait()
    # A pair is identified by its two denominators.
    hashing = frac1.denominator, frac2.denominator
    if hashing in seen:
        continue
    seen.add(hashing)

    nums[weight] = nums.get(weight, 0) + 1
    print('{} + {} = {}'.format(frac1, frac2, frac1 + frac2))
    if nums[weight] > 5:
        print(weight)
        print(nums)
        break

    # Successors: bump the denominator of either fraction by one.
    fracp = Fraction(1, frac1.denominator + 1), frac2
    fracpp = frac1, Fraction(1, frac2.denominator + 1)
    for successor in (fracp, fracpp):
        pq.put_nowait(
            ((successor[0] + successor[1]).denominator,
             (successor[0], successor[1])))
class AsyncoreReactor(object):
    """Runs ``asyncore.loop`` and a timer queue on a daemon thread.

    ``_map`` is the socket map handed to asyncore; ``_timers`` holds
    ``(end time, timer)`` pairs, earliest first.
    """
    _thread = None
    _is_live = False
    logger = logging.getLogger("Reactor")

    def __init__(self):
        self._timers = PriorityQueue()
        self._map = {}

    def start(self):
        """Mark the reactor live and start its daemon thread."""
        self._is_live = True
        worker = threading.Thread(target=self._loop, name="hazelcast-reactor")
        self._thread = worker
        worker.daemon = True
        worker.start()

    def _loop(self):
        """Thread body: poll asyncore and fire timers until shut down."""
        self.logger.debug("Starting Reactor Thread")
        Future._threading_locals.is_reactor_thread = True
        while self._is_live:
            try:
                asyncore.loop(count=1000, timeout=0.01, map=self._map)
                self._check_timers()
            except select.error:
                # TODO: parse error type to catch only error "9"
                continue
            except:
                self.logger.exception("Error in Reactor Thread")
                # TODO: shutdown client
                return
        self.logger.debug("Reactor Thread exited.")

    def _check_timers(self):
        """Drop timers from the front of the queue while ``check_timer`` accepts them."""
        now = time.time()
        while not self._timers.empty():
            # Peek at the earliest timer without removing it.
            try:
                _, timer = self._timers.queue[0]
            except IndexError:
                return
            if not timer.check_timer(now):
                return
            self._timers.get_nowait()

    def add_timer_absolute(self, timeout, callback):
        """Queue a timer ending at ``timeout`` and return it."""
        # NOTE(review): _cleanup_timer is not defined in the part of the
        # class visible here -- confirm it exists.
        timer = Timer(timeout, callback, self._cleanup_timer)
        self._timers.put_nowait((timer.end, timer))
        return timer

    def add_timer(self, delay, callback):
        """Queue a timer ending ``delay`` seconds from now and return it."""
        return self.add_timer_absolute(delay + time.time(), callback)

    def shutdown(self):
        """Close every connection, stop the loop and join the thread."""
        for connection in self._map.values():
            try:
                connection.close(HazelcastError("Client is shutting down"))
            except OSError as err:
                # An already-closed descriptor is fine; anything else is not.
                if err.args[0] != socket.EBADF:
                    raise
        self._map.clear()
        self._is_live = False
        self._thread.join()
class RandomDelayedAction(threading.Thread):
    """Runs callables after a profiled delay on a background thread.

    Jobs are kept in a priority queue keyed by their due time; ``run``
    polls the queue and executes whatever is due.  When ``NO_OP`` is set
    the constructor stops after creating the queue and locks, every delay
    is 0 and ``run`` returns immediately.
    """

    def __init__(self):
        threading.Thread.__init__(self)

        # Job queue
        self._pq = PriorityQueue()
        self._pq_lock = threading.RLock()
        self._exec_lock = threading.RLock()

        if NO_OP:
            return

        # Capture ingress SYN/ACK traffic into queue in a separate process.
        self._pkt_queue = multiprocessing.Queue()
        pcap_p = multiprocessing.Process(
            target=_pcap_process, args=(self._pkt_queue,))
        pcap_p.daemon = True
        pcap_p.start()

        # Introduce packet delays based on real performance.
        self._pkt_in_profiler = DelayProfiler(
            "./profile/%s/%s-pkt-in.csv" % (DELAY_PROFILE_TYPE, DELAY_PROFILE))
        self._flow_mod_profiler = DelayProfiler(
            "./profile/%s/%s-flow-mod.csv" % (DELAY_PROFILE_TYPE, DELAY_PROFILE))

        # Part of the ovs overhead that has not been accounted for.
        self._unused_ovs_overhead = 0

        # Start loop that executes jobs and that processes tcpdump output.
        self.daemon = True
        self.start()

        banner = "*" * 80
        print(banner)
        print('Delayed Action, using profile "%s"-"%s".' % (DELAY_PROFILE_TYPE, DELAY_PROFILE))
        print(banner)

    def _get_delay(self, filter_obj):
        """Return the profiled delay for ``filter_obj``; 0 for other types."""
        if NO_OP:
            return 0
        if isinstance(filter_obj, ofp_packet_in):
            return self._pkt_in_profiler.get_delay()
        if isinstance(filter_obj, ofp_flow_mod):
            return self._flow_mod_profiler.get_delay()
        return 0

    def add_job(self, filter_obj, func, *args, **kwargs):
        """Run ``func`` now, queue it for later, or drop it, depending on the delay."""
        delay = self._get_delay(filter_obj) - MAGIC_OVERHEAD
        if delay <= 0.002:
            return self._execute(func, *args, **kwargs)
        if delay > 5:
            return  # Drop straight away

        current_time = time.time()

        # Compensate for OVS overhead, but only for packet-in events.
        if isinstance(filter_obj, ofp_packet_in):
            pkt_in = args[1]
            (src_port, dst_port) = _get_tcp_src_dst_ports(pkt_in.data)
            if src_port and dst_port:
                ovs_overhead = self._get_ovs_overhead(
                    src_port, dst_port, current_time)
                ovs_overhead += self._unused_ovs_overhead
                delay = delay - ovs_overhead
                if delay <= 0:
                    # self._unused_ovs_overhead += 0.0 - delay #TODO: Should we do this?
                    return self._execute(func, *args, **kwargs)

        # Add event to job queue.
        with self._pq_lock:
            self._pq.put((delay + current_time, func, args, kwargs))

    def run(self):
        """Thread body: execute queued jobs once their due time has passed."""
        if NO_OP:
            return
        while True:
            # Peek
            now = time.time()
            with self._pq_lock:
                heap = self._pq.queue
                ready = len(heap) > 0 and not (now < heap[0][0])
            if not ready:
                time.sleep(0.001)
                continue

            # Pop
            try:
                with self._pq_lock:
                    job = self._pq.get_nowait()
            except Empty:
                continue

            # Run the job.
            self._execute(job[1], *job[2], **job[3])

    def _get_ovs_overhead(self, src_port, dst_port, current_time, max_attempt=5):
        """
        Continuously asks if pcap has seen <src_port, dst_port>. Stops when it
        appears in the pcap history. Extract the pcap time. Based on the
        current time, we can compute and return the overhead as a result of
        OVS.  Gives up and returns 0 after ``max_attempt`` packets or when
        the packet queue is empty.
        """
        # Average loop count is around 2.
        attempt = 0
        while attempt < max_attempt:
            attempt += 1
            try:
                (timestamp, src, dst) = self._pkt_queue.get_nowait()
            except Empty:
                return 0  # What usually happens is pcap cannot keep up
            if (src, dst) == (src_port, dst_port):
                return current_time - timestamp + 0.001  # Magic number
        return 0  # Almost never happens.

    def _execute(self, func, *args, **kwargs):
        """Call ``func`` under the execution lock; report any exception on stderr."""
        try:
            with self._exec_lock:
                func(*args, **kwargs)
        except Exception as err:
            sys.stderr.write("DelayedAction exception:" + " " + str(err) + "\n")
            sys.stderr.write(traceback.format_exc() + "\n")
def learnDT(learnDTNode, func, dist, initCons, params):
    """Grow a decision tree greedily, expanding the highest-gain node first.

    ``learnDTNode(func, dist, cons)`` must return
    ``(internalData, internalScore, leafData, leafScore)``; the gain of
    turning a node into an internal node is ``internalScore - leafScore``.
    Candidates wait in a priority queue keyed by negated gain.  Growth
    stops on ``params.minGain``, ``params.tgtScore`` or ``params.maxSize``
    (each ignored when ``None``) or when only nodes without internal data
    remain; whatever is still queued becomes a leaf.

    Returns ``DT(_learnDTHelper(dt, 0))``.
    """
    # Step 1: Initialize decision tree, score, and worklist
    # The decision tree is represented by dt, which is a map of type
    #
    # type params:
    #  I : internal node
    #  L : leaf node
    #
    # types:
    #  dt : {int : (I * _DT_INTERNAL) | (L * _DT_LEAF) }
    dt = {}
    worklist = PriorityQueue()

    (rootInternal, rootInternalScore,
     rootLeaf, rootLeafScore) = learnDTNode(func, dist, initCons)
    rootGain = rootInternalScore - rootLeafScore
    # Entries: (-gain, internal data, leaf data, node index, depth)
    worklist.put_nowait((-rootGain, rootInternal, rootLeaf, 0, 1))

    score = rootLeafScore
    size = 1

    # Step 2: Iterate through the worklist and construct internal nodes
    # Step 2a: Get the next element (stop if worklist is empty)
    while not worklist.empty():
        (minusGain, dtInternalData, dtLeafData,
         index, depth) = worklist.get_nowait()
        gain = -minusGain
        log('Internal node index: ' + str(index), INFO)

        # Step 2b: Get the internal data, and add to decision tree
        if dtInternalData is None:
            log('No internal data!', INFO)
            # Re-queue behind every real candidate (priority 2.0 means a
            # gain of -2.0); popping it again signals nothing is left.
            worklist.put_nowait(
                (2.0, dtInternalData, dtLeafData, index, depth))
            if gain < -1.5:
                log('No internal nodes remaining, ending!', INFO)
                break
            continue

        (dtInternalNode, lcons, rcons) = dtInternalData
        dt[index] = (dtInternalNode, _DT_INTERNAL)

        # Step 2c: Learn the left and right children
        left = learnDTNode(func, dist, lcons)
        right = learnDTNode(func, dist, rcons)

        # Step 2d: Add children to worklist
        childDepth = depth + 1
        worklist.put_nowait(
            (-(left[1] - left[3]), left[0], left[2],
             2 * index + 1, childDepth))
        worklist.put_nowait(
            (-(right[1] - right[3]), right[0], right[2],
             2 * index + 2, childDepth))

        # Step 2e: Compute score
        score += gain
        size += 2
        log('Current gain: ' + str(gain), INFO)
        log('Current score: ' + str(score), INFO)
        log('Current size: ' + str(size), INFO)

        # Step 2f: Check stopping conditions
        if params.minGain is not None and gain < params.minGain:
            log('Gain too small, ending!', INFO)
            break
        if params.tgtScore is not None and score >= params.tgtScore:
            log('Achieved target score, ending!', INFO)
            break
        if params.maxSize is not None and size >= params.maxSize:
            log('Reached maximum size, ending!', INFO)
            break
        if gain < -1.5:
            log('No internal nodes remaining, ending!', INFO)
            break

    # Step 3: Iterate through remaining nodes and construct leaf nodes
    while not worklist.empty():
        (minusGain, dtInternalData, dtLeafData,
         index, depth) = worklist.get_nowait()
        gain = -minusGain
        if dtInternalData is None and gain > 0.0:
            raise Exception('None node with non-zero gain: ' + str(dtInternalData))
        log('Leaf node index: ' + str(index), INFO)
        dt[index] = (dtLeafData, _DT_LEAF)

    # Step 4: Construct the decision tree
    return DT(_learnDTHelper(dt, 0))
def parse(sequence, expects, timeout):
    """Best-first parser: yield reductions of ``sequence`` in order of badness.

    This is a generator.  Work items are ``(badness, start, index, rule,
    matches)`` tuples taken from a priority queue, lowest badness first.
    A ``Reduction`` is yielded whenever a completed item spans the whole
    sequence (``start == 0`` and ``index == len(sequence)``).

    sequence -- the items to parse
    expects  -- candidate top-level rules; only those accepted by
                ``valid_compound`` are tried
    timeout  -- seconds allowed before giving up; the deadline is reset
                after every yielded result

    Raises ``Exception("timeout")`` when the deadline passes while items
    are still queued.
    """
    q = PriorityQueue()  # The unprocessed items end up into the queue
    # Parsing constantly fills it up, and cannot
    # progress after it becomes empty.
    wait = defaultdict(list)  # Items that wait for reduction. (start, rule) -> [item]
    # Items that have been finished at (start, rule) -> [(r_badness, stop, value)]
    # NOTE(review): the code below appends (index, result) pairs, not the
    # triples described above.
    fini = defaultdict(list)
    halt = time() + timeout  # When we should give up.
    for rule in expects:  # The queue is populated with initial starting states.
        if valid_compound(rule):
            q.put((0, 0, 0, rule, []))
    while not q.empty():
        if halt < time():
            raise Exception("timeout")
        badness, start, index, rule, matches = q.get_nowait()  # Queue is filled up from the results of shifting.
        if ((isinstance(rule, Group) and len(rule) == len(matches)) or
            (isinstance(rule, Plus) and len(matches) >= 1) or
            (isinstance(rule, Star))):
            # If shifting results in completely reduced construct, we want to reduce using it.
            # Reduction usually results in one or more shifts and it is stored
            # to allow worse reductions with the same rule again.
            if start == 0 and index == len(sequence):
                yield Reduction(rule, matches, badness)
                halt = time() + timeout  # reset halt when we succeed.
                continue
            else:
                result = Reduction(rule, matches, badness)
                fini[(start, rule)].append((index, result))
                # Resume every item that was waiting for this rule at this
                # start position, now extended by the finished result.
                for g_badness, g_start, g_rule, g_matches in wait[(start, rule)]:
                    q.put((
                        g_badness + result.badness,
                        g_start,
                        index,
                        g_rule,
                        g_matches + [result]))
            # A complete Group cannot take further matches.
            if isinstance(rule, Group):
                continue
        if index >= len(sequence):  # Some rules may appear at positions where they cannot complete.
            continue
        subrule = rule.at(len(matches))
        subrules = ()
        match = subrule.match(sequence[index])
        if subrule.validate(sequence[index]) and match[1]:
            if isinstance(match[1], Keyword):  # Operator inserted where Keyword matches.
                shift_badness = 1
                term = Operator(match[1], sequence[index])
            else:
                shift_badness = 10
                term = sequence[index]
            q.put((
                badness + shift_badness,
                start,
                index + 1,
                rule,
                matches + [term]))
        # Even if rule matched to a symbol or construct, it may match other ways too
        if isinstance(subrule, ListRule):
            subrules = [(10, subrule)]
        elif isinstance(subrule, Context):
            # Larger constructs with many indirections
            # are treated as worse results.
            subrules = [(100, d_rule) for d_rule in subrule.rules if valid_compound(d_rule)]
            for pre, ind_rule in subrule.indirect_rules:
                if valid_compound(ind_rule):
                    subrules.append((100 + len(pre)*10, ind_rule))
        # If there are rules that can reduce, we shift with them.
        # Otherwise we add a blank shift to parse the rule and initiate fini to fill up.
        for b_badness, subrule in subrules:
            if fini.has_key((index, subrule)):
                for stop, result in fini[(index, subrule)]:
                    q.put((
                        b_badness + badness + result.badness,
                        start,
                        stop,
                        rule,
                        matches + [result]))
            elif isinstance(rule, ListRule):
                q.put((0, index, index, subrule, []))
                # Avoid recursion
                # NOTE(review): this resets fini for the current
                # (start, rule), not for (index, subrule) -- confirm intended.
                fini[(start, rule)] = []
            # Even if fini contained items, at this point we're not sure if
            # fini still fills up, so we need to add a wait every time.
            wait[(index, subrule)].append((b_badness+badness, start, rule, matches))
class AWS(object):
    """Abstraction of an AWS state. Typically this is fed `INSTANCES` on
    startup (via `Mock`) and this maintains information on state
    changes on those instances later, for example, terminated
    instances are marked terminated etc."""

    STABLE_INSTANCE_STATES = ('running', 'stopped', 'terminated')

    def __init__(self, instances=None):
        """Initialize AWS state mockup from given list of
        `instances`. Each `instance` record in `instances` is a
        dictionary. See global `INSTANCES` for example on how to
        configure. Omitting `instances` (or passing `None`) starts
        with no instances. The records are deep-copied, so the
        caller's data is never modified."""
        self.log = logging.getLogger('freezr.systemtests.aws.AWS')
        self.instances = {}
        self.count = 0
        self.ops = PriorityQueue()
        # Insertion counter for `later`: breaks ties between operations
        # due at the same time so the queue never has to compare the
        # callables themselves.
        self._op_seq = 0

        for instance in deepcopy(instances or []):
            self.add_instance(instance)

    def add_instance(self, data):
        """Adds a single instance data. This will set meaningful
        defaults on any missing fields (including instance id, which
        is autogenerated if missing). If the initial instance state is
        in a transitioning state it'll be scheduled for later update
        automatically."""
        self.count += 1

        instance = {
            'id': 'i-%06d' % (self.count,),
            'region': DEFAULT_REGION,
            'root_device_type': DEFAULT_ROOT_DEVICE_TYPE,
            'instance_type': DEFAULT_INSTANCE_TYPE,
            'state': DEFAULT_STATE,
            'vpc_id': DEFAULT_VPC_ID,
            'tags': {},
        }

        instance.update(data)
        self.instances[instance['id']] = instance

        if instance['state'] not in self.STABLE_INSTANCE_STATES:
            self.later(10, self.instance_state_proceed, instance)

        self.log.debug('Added instance: %r', instance)

    def later(self, secs, fn, *args, **kwargs):
        """Schedule an operation a minimum of `secs` later, calling
        `fn` with args `args` and kwargs `kwargs`. Operations due at
        the same time run in the order they were scheduled."""
        when = time() + secs
        self._op_seq += 1
        op = (when, self._op_seq, lambda: fn(*args, **kwargs))
        self.ops.put(op)
        self.log.debug("Added later %.1fs: %r", secs, op)

    def tick(self):
        """'Tick' the AWS state by checking whether there are any
        pending operations (see `later`) that should be run before
        proceeding."""
        self.log.debug("tick (%d ops)", self.ops.qsize())

        while not self.ops.empty():
            op = self.ops.get_nowait()
            when, _seq, call = op

            # not yet?
            if time() < when:
                self.log.debug("Task due in %.1fs, put it back",
                               when - time())
                # The earliest operation is not due, so nothing else is.
                self.ops.put(op)
                return

            self.log.debug("Running task due for %.1fs: %r", when, call)
            call()

    def get_instances(self):
        """Return a list of instances. The returned list elements try
        to mimic the behavior of `boto.ec2.instances.Instance` to the
        extent needed by freezr."""
        self.tick()
        self.log.debug("get_instances: %d instances", len(self.instances))
        return [AttrDict(instance) for instance in self.instances.values()]

    def terminate_instance(self, id):
        """Move running instance `id` to 'terminating' and schedule
        the transition to its stable state."""
        self.tick()
        self.log.debug("terminate_instance: %r", id)
        instance = self.instances[id]
        assert instance['state'] == 'running'
        instance['state'] = 'terminating'
        self.later(10, self.instance_state_proceed, instance)

    def stop_instance(self, id):
        """Move running instance `id` to 'stopping' and schedule the
        transition to its stable state."""
        self.tick()
        self.log.debug("stop_instance: %r", id)
        instance = self.instances[id]
        assert instance['state'] == 'running'
        instance['state'] = 'stopping'
        self.later(10, self.instance_state_proceed, instance)

    def start_instance(self, id):
        """Move stopped instance `id` to 'pending' and schedule the
        transition to its stable state."""
        self.tick()
        self.log.debug("start_instance: %r", id)
        instance = self.instances[id]
        assert instance['state'] == 'stopped'
        instance['state'] = 'pending'
        self.later(10, self.instance_state_proceed, instance)

    # operations on instances

    def instance_state_proceed(self, instance):
        """Given an instance that is in a transitioning state, move it
        to the matching stable state (e.g. "pending" -> "running",
        "terminating" -> "terminated", "stopping" -> "stopped").
        Any other state is left unchanged."""
        self.log.debug("instance_state_proceed: instance %s, state %s",
                       instance['id'], instance['state'])

        state = instance['state']

        if state == 'pending':
            state = 'running'
        elif state == 'stopping':
            state = 'stopped'
        elif state == 'terminating':
            state = 'terminated'

        instance['state'] = state

        self.log.debug("instance_state_proceed: final state %s", state)
# Queue every token under its negated count, so the min-first queue hands
# back the most frequent tokens first.
for t in token_count_dict:
    q.put([-token_count_dict[t], t])
token_dict = {}
# add special token
token_dict[zero_token] = 0
token_dict[unknown_token] = 1
token_dict[start_token] = 2
token_dict[end_token] = 3
# Regular tokens are numbered after the four special ones.
token_index = 4
token_count_dict = {}
# priority queue
while not q.empty():
    get = q.get_nowait()
    # Stop once the dictionary holds max_dict_size entries; the token just
    # popped is discarded.
    if token_index == max_dict_size:
        break
    token_dict[get[1]] = token_index
    token_index += 1
# -------------------------build data pair------------------------------
# write to file
with open(file_name) as f:
    line_count = 0
    last_exist = False
    last_list = []
    pair_count = 0
    total_token = 0
    # one way flag
class AbstractBaseFrontier(object, LoggingMixin):
    """
    A base class for implementing frontiers.

    Basically this class provides the different general methods and
    configuration parameters used for frontiers.
    """

    def __init__(self, settings, log_handler, front_end_queues, prioritizer,
        unique_hash='sha1'):
        """
        Initialize the frontier and instantiate the
        :class:`SQLiteSingleHostUriQueue`.

        The default frontier we will use the `sha1` hash function for the
        unique uri filter. For very large crawls you might want to use a
        larger hash function (`sha512`, e.g.)
        """
        LoggingMixin.__init__(self, log_handler, settings.LOG_LEVEL_MASTER)
        # front end queue
        self._prioritizer = prioritizer
        self._front_end_queues = front_end_queues
        # checkpointing
        self._checkpoint_interval = settings.FRONTIER_CHECKPOINTING
        self._uris_added = 0

        # the heap
        self._heap = PriorityQueue(maxsize=settings.FRONTIER_HEAP_SIZE)
        self._heap_min_size = settings.FRONTIER_HEAP_MIN

        # a list of uris currently being crawled.
        self._current_uris = dict()
        # dns cache
        self._dns_cache = DnsCache(settings)
        # unique uri filter
        self._unique_uri = UniqueUriFilter(unique_hash)
        for url in self._front_end_queues.all_uris():
            # Registering the url is a required side effect, so it must not
            # live inside the assert: assert statements are removed when
            # Python runs with -O and the filter would stay empty.
            already_known = self._unique_uri.is_known(url,
                add_if_unknown=True)
            assert not already_known

        # the sinks
        self._sinks = []

        # timezone
        self._timezone = settings.LOCAL_TIMEZONE
        self._logger.info("frontier::initialized")

    def add_sink(self, sink):
        """
        Add a sink to the frontier. A sink will be responsible for the long
        term storage of the crawled contents.
        """
        self._sinks.append(sink)

    def add_uri(self, curi):
        """
        Add the specified :class:`CrawlUri` to the frontier.

        `next_date` is a datetime object for the next time the uri should be
        crawled.

        Note: time based crawling is never strict, it is generally used as some
        kind of prioritization.
        """
        if self._unique_uri.is_known(curi.url, add_if_unknown=True):
            # we already know this uri
            self._logger.debug("frontier::Trying to update a known uri... " + \
                    "(%s)" % (curi.url,))
            return

        self._logger.info("frontier::Adding '%s' to the frontier" % curi.url)
        self._front_end_queues.add_uri(self._uri_from_curi(curi))
        self._maybe_checkpoint()

    def update_uri(self, curi):
        """
        Update a given uri.
        """
        self._front_end_queues.update_uri(self._uri_from_curi(curi))
        self._maybe_checkpoint()

    def get_next(self):
        """
        Return the next uri scheduled for crawling.

        The heap is refilled through `_update_heap` when it holds fewer than
        the configured minimum. Raises `Empty` when there is nothing to
        crawl.
        """
        if self._heap.qsize() < self._heap_min_size:
            self._update_heap()

        try:
            (_next_date, next_uri) = self._heap.get_nowait()
        except Empty:
            # heap is empty, there is nothing to crawl right now!
            # maybe log this in the future
            raise

        return self._crawluri_from_uri(next_uri)

    def close(self):
        """
        Close the underlying frontend queues.
        """
        self._front_end_queues.checkpoint()
        self._front_end_queues.close()

    def _add_to_heap(self, uri, next_date):
        """
        Add an URI to the heap that is ready to be crawled.
        """
        self._heap.put_nowait((next_date, uri))
        (url, _etag, _mod_date, _next_date, _prio) = uri
        self._current_uris[url] = uri
        self._logger.debug("frontier::Adding '%s' to the heap" % url)

    def _reschedule_uri(self, curi):
        """
        Return the `next_crawl_date` for :class:`CrawlUri`s.

        The result is the tuple `(prio, next_crawl_date)` with the date as a
        timestamp.
        """
        (prio, delta) = self._prioritizer.calculate_priority(curi)
        now = datetime.now(self._timezone)
        return (prio, time.mktime((now + delta).timetuple()))

    def _ignore_uri(self, curi):
        """
        Ignore a :class:`CrawlUri` from now on.
        """
        self._front_end_queues.ignore_uri(curi.url, curi.status_code)

    def _uri_from_curi(self, curi):
        """
        Create the uri tuple from the :class:`CrawlUri` and calculate the
        priority.

        Overwrite this method in more specific frontiers.
        """
        etag = mod_date = None
        if curi.rep_header:
            if "Etag" in curi.rep_header:
                etag = curi.rep_header["Etag"]
            if "Last-Modified" in curi.rep_header:
                mod_date = time.mktime(deserialize_date_time(
                    curi.rep_header["Last-Modified"]).timetuple())
            # fall back to the response date when no modification date is set
            if not mod_date and 'Date' in curi.rep_header:
                mod_date = time.mktime(deserialize_date_time(
                    curi.rep_header["Date"]).timetuple())

        if mod_date:
            # only reschedule if it has been crawled before
            (prio, next_crawl_date) = self._reschedule_uri(curi)
        else:
            (prio, next_crawl_date) = (1,
                    time.mktime(datetime.now(self._timezone).timetuple()))

        return (curi.url, etag, mod_date, next_crawl_date, prio)

    def _crawluri_from_uri(self, uri):
        """
        Convert an URI tuple to a :class:`CrawlUri`.

        Replace the hostname with the real IP in order to cache DNS queries.
        """
        (url, etag, mod_date, _next_date, prio) = uri

        parsed_url = urlparse(url)

        # dns resolution and caching
        port = parsed_url.port
        if not port:
            port = PROTOCOLS_DEFAULT_PORT[parsed_url.scheme]

        effective_netloc = self._dns_cache["%s:%s" % (parsed_url.hostname,
            port)]

        curi = CrawlUri(url)
        curi.effective_url = url.replace(parsed_url.netloc, "%s:%s" %
                effective_netloc)
        curi.current_priority = prio
        curi.req_header = dict()
        if etag:
            curi.req_header["Etag"] = etag
        if mod_date:
            mod_date_time = datetime.fromtimestamp(mod_date)
            curi.req_header["Last-Modified"] = serialize_date_time(
                    mod_date_time)

        curi.optional_vars = dict()
        if parsed_url.username and parsed_url.password:
            curi.optional_vars[CURI_SITE_USERNAME] = \
                parsed_url.username.encode()
            curi.optional_vars[CURI_SITE_PASSWORD] = \
                parsed_url.password.encode()

        return curi

    def _update_heap(self):
        """
        Abstract method. Implement this in the actual Frontier.

        The implementation should really only add uris to the heap if they can
        be downloaded right away.
        """
        pass

    def _maybe_checkpoint(self, force_checkpoint=False):
        """
        Periodically checkpoint the state db.
        """
        self._uris_added += 1
        if self._uris_added > self._checkpoint_interval or force_checkpoint:
            self._front_end_queues.checkpoint()
            self._uris_added = 0

    def process_successful_crawl(self, curi):
        """
        Called when an URI has been crawled successfully.

        `curi` is a :class:`CrawlUri`
        """
        self.update_uri(curi)

        if curi.optional_vars and CURI_EXTRACTED_URLS in curi.optional_vars:
            for url in curi.optional_vars[CURI_EXTRACTED_URLS].split("\n"):
                if len(url) > 5 and not self._unique_uri.is_known(url):
                    self.add_uri(CrawlUri(url))

        del self._current_uris[curi.url]

        for sink in self._sinks:
            sink.process_successful_crawl(curi)

    def process_not_found(self, curi):
        """
        Called when an URL was not found.

        This could mean, that the URL has been removed from the server. If so,
        do something about it!

        Override this method in the actual frontier implementation.
        """
        del self._current_uris[curi.url]
        self._ignore_uri(curi)

        for sink in self._sinks:
            sink.process_not_found(curi)

    def process_redirect(self, curi):
        """
        Called when there were too many redirects for an URL, or the site has
        note been updated since the last visit.

        In the latter case, update the internal uri and increase the priority
        level.
        """
        del self._current_uris[curi.url]

        if curi.status_code in [301, 302]:
            # simply ignore the URL. The URL that is being redirected to is
            # extracted and added in the processing
            self._ignore_uri(curi)

        if curi.status_code == 304:
            # the page has not been modified since the last visit! Update it
            # NOTE: prio increasing happens in the prioritizer
            self.update_uri(curi)

        for sink in self._sinks:
            sink.process_redirect(curi)

    def process_server_error(self, curi):
        """
        Called when there was some kind of server error.

        Override this method in the actual frontier implementation.
        """
        del self._current_uris[curi.url]
        self._ignore_uri(curi)

        for sink in self._sinks:
            sink.process_server_error(curi)
class MCMH(object):
    '''
    A generic searching algorithm that samples from the distribution of the
    scores in accordance with the algorithm's belief in the viability of that
    region.

    Supports asynchronous updating (i.e., it is possible to draw sequential
    samples without updating the object's knowledge about the distribution).

    Sampling a search frame also schedules its right-hand neighbour as the
    very next sample, so that complete search intervals get produced.

    NOTE(review): an earlier description mentioned a `min_dist` between
    samples; no such parameter exists in this class.
    '''

    def __init__(self, elements, search_interval, clip=None):
        '''
        elements: the number of elements to search over.
        search_interval: the step, in frames, between consecutive search
                         frames.
        clip: how much of the bookends of the region to ignore, as a
              fraction of `elements`. None (the default) means no clipping.

        NOTES:
            With a search interval of 4 and a search frame j, the search
            frames are j, j+4, j+8, ...:

            ... j-1  j  j+1 j+2 j+3 j+4 j+5 j+6 ...
                     ^               ^            search frames
                     |-search interval-|

            (A finer "search step" used during a local search is handled by
            the caller and is not surfaced to this class.)
        '''
        self.search_interval = search_interval
        self.clip = clip
        self.elements = elements
        self._lock = threading.Lock()
        self._setup()

    def _setup(self):
        '''
        Allocates all the bookkeeping structures.

        Raises IndexError if the clipped range contains no search frame.
        '''
        N = self.elements
        # clip=None is the documented default; treat it as "no clipping"
        # instead of failing on None * N.
        c = self.clip or 0.
        start = int(c * N)
        stop = int(N - (c * N))
        search_frames = np.arange(start, stop,
                                  self.search_interval).astype(int)
        self._tot = 0.  # sum of scores
        self.n_samples = 0.
        self._n = 0.  # total scored
        self._first = search_frames[0]
        self._last = search_frames[-1]
        # search frame (index) to frame number dictionary
        self._sf2fno = dict(enumerate(search_frames))
        # frame number to search frame (index) dictionary
        self._fno2sf = {v: k for k, v in self._sf2fno.items()}
        # list of (search frame, score) tuples, sorted by search frame.
        self._scores = []
        # whether or not each search frame has been scored
        self._scored = [False] * len(search_frames)
        self._srt_scores = []  # list of scores, sorted by the score
        # holds (-estimated score, search frame) for searchable intervals
        self._search_queue = PriorityQueue()
        # search frames not handed out yet; must be a real list because
        # entries are removed from it as they get sampled.
        self._sample_queue = list(range(len(search_frames)))
        self.max_samps = len(search_frames)
        self._up_next = None  # for ensuring search intervals are produced.

    @property
    def _mean(self):
        '''Mean of the scores received so far (0 before the first score).'''
        return self._tot / max(self._n, 1.)

    def update(self, frameno, score):
        '''Thread-safe wrapper around _update.'''
        with self._lock:
            self._update(frameno, score)

    def _update(self, frameno, score):
        '''
        Updates the knowledge of the algorithm with the score of a frame.

        Frame numbers that are not search frames, and frames that were
        already scored, are logged and ignored.

        NOTE(review): `score` takes part in arithmetic below, so a score of
        None is not supported here even though callers may expect it to
        signal a failed frame -- confirm against the callers.
        '''
        sf = self._fno2sf.get(frameno, None)
        if sf is None:
            # That is not a valid search frame.
            _log.warn('Invalid search frame.')
            return
        if self._scored[sf]:
            # you've already sampled this frame.
            _log.debug('Sample has already been scored.')
            return
        insort(self._scores, (sf, score))
        # we have to keep track of which scores were actually
        # updated in case we get a score of 0.
        self._scored[sf] = True
        if frameno < self._last and self._scored[sf + 1]:
            # both ends of the interval to the right are scored, so it can be
            # searched; prioritise it by the mean of its end scores (negated
            # because the queue pops the smallest item first).
            est = (self._get_score(sf) + self._get_score(sf + 1)) * 0.5
            self._search_queue.put((-est, sf))
        if frameno > self._first and self._scored[sf - 1]:
            # same for the interval to the left, keyed by its left end.
            est = (self._get_score(sf - 1) + self._get_score(sf)) * 0.5
            self._search_queue.put((-est, sf - 1))
        insort(self._srt_scores, score)
        self._tot += score
        self._n += 1
        self.n_samples += 1
        _log.debug('Sampling %.1f%% complete',
                   self.n_samples * 100. / self.max_samps)

    def get_search(self):
        '''
        Returns an interval to search as (frame1, score1, frame2, score2),
        or None if no interval is currently queued.
        '''
        try:
            item = self._search_queue.get_nowait()
        except Empty:
            return
        sf = item[1]
        f1 = self._sf2fno[sf]
        f2 = self._sf2fno[sf + 1]
        s1 = self._get_score(sf)
        s2 = self._get_score(sf + 1)
        return (f1, s1, f2, s2)

    def get_sample(self):
        '''Thread-safe wrapper around _get_sample.'''
        with self._lock:
            return self._get_sample()

    def _get_sample(self):
        '''
        Returns the frame number of the next frame to sample, or None when
        every search frame has been handed out.
        '''
        if self._up_next is not None:
            # then return that to complete a local search interval
            sample = self._up_next
            self._up_next = None
            return self._sf2fno[sample]
        if not len(self._sample_queue):
            _log.debug_n('Sampling complete.')
            return None  # there is nothing left to sample.
        # Rejection sampling: a candidate is accepted with a probability
        # equal to the rank of its interpolated score among known scores.
        while True:
            sf = int(np.random.choice(self._sample_queue))
            isc = self._interp_score(sf)
            rnk = (1 + float(bisect_left(self._srt_scores, isc))) / (
                1 + len(self._srt_scores))
            if np.random.rand() < rnk:
                # then take the sample
                break
        self._sample_queue.remove(sf)
        if (sf + 1) in self._sample_queue:
            # reserve the right-hand neighbour for the following call
            self._up_next = sf + 1
            self._sample_queue.remove(sf + 1)
        return self._sf2fno[sf]

    def _find_lt(self, sf):
        '''
        Find the closest scored entry before sf. Falls back to (-1, mean)
        when there is none.
        '''
        i = bisect_left(self._scores, (sf, 0.))
        if i:
            return self._scores[i - 1]
        return (-1, self._mean)

    def _find_gt(self, sf):
        '''
        Find the closest scored entry at or after (sf, 0). Falls back to
        (number of search frames, mean) when there is none.
        '''
        i = bisect_right(self._scores, (sf, 0))
        if i != len(self._scores):
            return self._scores[i]
        return (len(self._scored), self._mean)

    def _get_score(self, sf):
        '''
        Returns the stored score of search frame sf.

        Raises ValueError if sf has not been scored.
        '''
        i = bisect_left(self._scores, (sf, -np.inf))
        if i != len(self._scores) and self._scores[i][0] == sf:
            sf, score = self._scores[i]
            return score
        # Not inside an exception handler, so there is no traceback to
        # attach: log a plain error.
        _log.error('Could not locate score for search frame %i' % sf)
        raise ValueError('Could not locate the score for %i' % sf)

    def _interp_score(self, sf):
        '''
        Returns the score for a search frame, linearly interpolated between
        its nearest scored neighbours.
        '''
        x1, y1 = self._find_lt(sf)
        x2, y2 = self._find_gt(sf)
        x3 = sf
        m = float(y2 - y1) / float(x2 - x1)
        return m * (x3 - x1) + y1
from Queue import PriorityQueue

# Enumerates, in increasing order, every power a**n with at least two digits
# whose decimal digits sum to the base a, and prints each as it is found.
#
# The queue holds one entry (a**n, (a, n)) per base that has been introduced
# so far; popping an entry and pushing (a**(n+1), (a, n+1)) walks each base's
# powers lazily.
A = []  # the matching powers, in the order they were found
next_a = 3  # next base that has not been introduced to the queue yet
next_a_s = next_a * next_a  # ... and its square, the first power queued
pq = PriorityQueue()
pq.put_nowait((2, (2, 1)))

# The loop is intentionally unbounded: every pop is followed by at least one
# push, so the queue never drains and the script runs until interrupted.
while True:
    ans, (a, n) = pq.get_nowait()
    if ans > next_a:
        # Not every base up to `ans` is queued yet: put the entry back,
        # introduce one more base and try again.
        pq.put_nowait((ans, (a, n)))
        pq.put_nowait((next_a_s, (next_a, 2)))
        next_a += 1
        next_a_s = next_a * next_a
        continue
    digits = str(ans)
    if len(digits) > 1 and a == sum(map(int, digits)):
        A.append(ans)
        print('{}^{} = {}'.format(a, n, ans))
    pq.put_nowait((ans * a, (a, n + 1)))
print MAP_NFILES_DIR[max_nfiles] nfiles = 0 for k in MAP_NFILES_DIR: nfiles += k * len(MAP_NFILES_DIR[k]) print 'we found',nfiles,'files in total' print 'average number of files per leaf:',nfiles * 1. / n_leaves # tmp files ntmpfiles = len( get_all_files(maindir,ext='.h5_tmp') ) print 'we found',ntmpfiles,'temp files' if ntmpfiles > 0: print 'WATCHOUT FOR TMP FILES!!!!' # find modif date for all files, and pop out the most recent ones get_all_files_modif_date(maindir) print '******************************************************' if not trim and not trimdryrun: print 'most recent files are:' for k in range(5): t,f = MODIFQUEUE.get_nowait() print f,'(',time.ctime(-t),')' elif trim or trimdryrun: ntoomany = nfiles - 1000000 print 'we have',ntoomany,'too many files.' for k in range(ntoomany): t,f = MODIFQUEUE.get_nowait() print f,'(',time.ctime(-t),')' if trim: os.remove(f) # done print '******************************************************'
print MAP_NFILES_DIR[max_nfiles] nfiles = 0 for k in MAP_NFILES_DIR: nfiles += k * len(MAP_NFILES_DIR[k]) print 'we found', nfiles, 'files in total' print 'average number of files per leaf:', nfiles * 1. / n_leaves # tmp files ntmpfiles = len(get_all_files(maindir, ext='.h5_tmp')) print 'we found', ntmpfiles, 'temp files' if ntmpfiles > 0: print 'WATCHOUT FOR TMP FILES!!!!' # find modif date for all files, and pop out the most recent ones get_all_files_modif_date(maindir) print '******************************************************' if not trim and not trimdryrun: print 'most recent files are:' for k in range(5): t, f = MODIFQUEUE.get_nowait() print f, '(', time.ctime(-t), ')' elif trim or trimdryrun: ntoomany = nfiles - 1000000 print 'we have', ntoomany, 'too many files.' for k in range(ntoomany): t, f = MODIFQUEUE.get_nowait() print f, '(', time.ctime(-t), ')' if trim: os.remove(f) # done print '******************************************************'
import Queue
from Queue import PriorityQueue

# Scratch experiment: how PriorityQueue orders tuples with equal priorities,
# and which `except` clause is selected when an exception is raised.
queue = PriorityQueue(maxsize=100)
for label in ("item 1", "item 2", "item 3", "item 3"):
    queue.put((1, 1, label))

# Pop three of the four entries; one "item 3" stays queued.
for _ in range(3):
    print(queue.get())

try:
    2 / 0  # always raises, so the get_nowait() below is never reached
    print(queue.get_nowait())
except ZeroDivisionError:
    print('zero')
except Queue.Empty:
    print("empty")
except:
    print('other')
class AbstractBaseFrontier(object, LoggingMixin):
    """
    A base class for implementing frontiers.

    Basically this class provides the different general methods and
    configuration parameters used for frontiers.
    """

    def __init__(self, settings, log_handler, front_end_queues, prioritizer,
                 unique_hash='sha1'):
        """
        Initialize the frontier on top of the given `front_end_queues`.

        `settings` provides `LOG_LEVEL_MASTER`, `FRONTIER_CHECKPOINTING`,
        `FRONTIER_HEAP_SIZE`, `FRONTIER_HEAP_MIN` and `LOCAL_TIMEZONE`.

        By default the unique uri filter uses the `sha1` hash function. For
        very large crawls you might want to use a larger hash function
        (`sha512`, e.g.)
        """
        LoggingMixin.__init__(self, log_handler, settings.LOG_LEVEL_MASTER)
        # front end queue
        self._prioritizer = prioritizer
        self._front_end_queues = front_end_queues
        # checkpointing
        self._checkpoint_interval = settings.FRONTIER_CHECKPOINTING
        self._uris_added = 0
        # the heap
        self._heap = PriorityQueue(maxsize=settings.FRONTIER_HEAP_SIZE)
        self._heap_min_size = settings.FRONTIER_HEAP_MIN
        # a list of uris currently being crawled.
        self._current_uris = dict()
        # dns cache
        self._dns_cache = DnsCache(settings)
        # unique uri filter
        self._unique_uri = UniqueUriFilter(unique_hash)
        for url in self._front_end_queues.all_uris():
            # Register every persisted URL with the filter. The call must not
            # live inside the `assert` itself: with `python -O` asserts are
            # stripped and the filter would silently stay empty.
            already_known = self._unique_uri.is_known(url,
                                                      add_if_unknown=True)
            assert not already_known
        # the sinks
        self._sinks = []
        # timezone
        self._timezone = settings.LOCAL_TIMEZONE
        self._logger.info("frontier::initialized")

    def add_sink(self, sink):
        """
        Add a sink to the frontier. A sink will be responsible for the long
        term storage of the crawled contents.
        """
        self._sinks.append(sink)

    def add_uri(self, curi):
        """
        Add the specified :class:`CrawlUri` to the frontier.

        URIs that the unique filter already knows are logged and skipped.

        Note: time based crawling is never strict, it is generally used as
        some kind of prioritization.
        """
        if self._unique_uri.is_known(curi.url, add_if_unknown=True):
            # we already know this uri
            self._logger.debug(
                "frontier::Trying to update a known uri... (%s)"
                % (curi.url,))
            return
        self._logger.info("frontier::Adding '%s' to the frontier" % curi.url)
        self._front_end_queues.add_uri(self._uri_from_curi(curi))
        self._maybe_checkpoint()

    def update_uri(self, curi):
        """
        Update a given uri.
        """
        self._front_end_queues.update_uri(self._uri_from_curi(curi))
        self._maybe_checkpoint()

    def get_next(self):
        """
        Return the next uri scheduled for crawling.

        The heap is refilled first when it holds fewer than the configured
        minimum of entries. Raises the queue's `Empty` exception when there
        is nothing to crawl right now.
        """
        if self._heap.qsize() < self._heap_min_size:
            self._update_heap()
        # `Empty` deliberately propagates to the caller.
        (_next_date, next_uri) = self._heap.get_nowait()
        return self._crawluri_from_uri(next_uri)

    def close(self):
        """
        Checkpoint and close the underlying frontend queues.
        """
        self._front_end_queues.checkpoint()
        self._front_end_queues.close()

    def _crawl_now(self, uri):
        """
        Convenience method for crawling an uri right away.
        """
        # NOTE(review): 3000 is used as the heap priority (`next_date`);
        # presumably a value low enough to sort first -- confirm.
        self._add_to_heap(uri, 3000)

    def _add_to_heap(self, uri, next_date):
        """
        Add an URI to the heap that is ready to be crawled and remember it as
        currently being crawled.
        """
        self._heap.put_nowait((next_date, uri))
        (url, _etag, _mod_date, _next_date, _prio) = uri
        self._current_uris[url] = uri
        self._logger.debug("frontier::Adding '%s' to the heap" % url)

    def _reschedule_uri(self, curi):
        """
        Return the `(prio, next_crawl_date)` tuple for a :class:`CrawlUri`,
        as computed from the prioritizer's priority and time delta.
        """
        (prio, delta) = self._prioritizer.calculate_priority(curi)
        now = datetime.now(self._timezone)
        return (prio, time.mktime((now + delta).timetuple()))

    def _ignore_uri(self, curi):
        """
        Ignore a :class:`CrawlUri` from now on.
        """
        self._front_end_queues.ignore_uri(curi.url, curi.status_code)

    def _uri_from_curi(self, curi):
        """
        Create the uri tuple `(url, etag, mod_date, next_crawl_date, prio)`
        from the :class:`CrawlUri` and calculate the priority.

        Overwrite this method in more specific frontiers.
        """
        etag = mod_date = None
        if curi.rep_header:
            if "Etag" in curi.rep_header:
                etag = curi.rep_header["Etag"]
            if "Last-Modified" in curi.rep_header:
                mod_date = time.mktime(deserialize_date_time(
                    curi.rep_header["Last-Modified"]).timetuple())
            # fall back to the response date when there is no modification
            # date
            if not mod_date and 'Date' in curi.rep_header:
                mod_date = time.mktime(deserialize_date_time(
                    curi.rep_header["Date"]).timetuple())
        if mod_date:
            # only reschedule if it has been crawled before
            (prio, next_crawl_date) = self._reschedule_uri(curi)
        else:
            (prio, next_crawl_date) = (
                1, time.mktime(datetime.now(self._timezone).timetuple()))
        return (curi.url, etag, mod_date, next_crawl_date, prio)

    def _crawluri_from_uri(self, uri):
        """
        Convert an URI tuple to a :class:`CrawlUri`.

        Replace the hostname with the real IP in order to cache DNS queries.
        """
        (url, etag, mod_date, _next_date, prio) = uri
        parsed_url = urlparse(url)
        # dns resolution and caching
        port = parsed_url.port
        if not port:
            port = PROTOCOLS_DEFAULT_PORT[parsed_url.scheme]
        effective_netloc = self._dns_cache["%s:%s" % (parsed_url.hostname,
                                                      port)]
        curi = CrawlUri(url)
        curi.effective_url = url.replace(parsed_url.netloc,
                                         "%s:%s" % effective_netloc)
        curi.current_priority = prio
        curi.req_header = dict()
        if etag:
            curi.req_header["Etag"] = etag
        if mod_date:
            mod_date_time = datetime.fromtimestamp(mod_date)
            curi.req_header["Last-Modified"] = serialize_date_time(
                mod_date_time)
        curi.optional_vars = dict()
        if parsed_url.username and parsed_url.password:
            curi.optional_vars[CURI_SITE_USERNAME] = \
                parsed_url.username.encode()
            curi.optional_vars[CURI_SITE_PASSWORD] = \
                parsed_url.password.encode()
        return curi

    def _update_heap(self):
        """
        Abstract method. Implement this in the actual Frontier.

        The implementation should really only add uris to the heap if they
        can be downloaded right away.
        """
        pass

    def _maybe_checkpoint(self, force_checkpoint=False):
        """
        Periodically checkpoint the state db.

        Every call counts as one added uri; the checkpoint happens once the
        counter exceeds the configured interval or when `force_checkpoint`
        is set.
        """
        self._uris_added += 1
        if self._uris_added > self._checkpoint_interval or force_checkpoint:
            self._front_end_queues.checkpoint()
            self._uris_added = 0

    def process_successful_crawl(self, curi):
        """
        Called when an URI has been crawled successfully.

        `curi` is a :class:`CrawlUri`. Its extracted URLs, if any, are added
        to the frontier unless they are already known.
        """
        self.update_uri(curi)
        if curi.optional_vars and CURI_EXTRACTED_URLS in curi.optional_vars:
            for url in curi.optional_vars[CURI_EXTRACTED_URLS].split("\n"):
                if len(url) > 5 and not self._unique_uri.is_known(url):
                    self.add_uri(CrawlUri(url))
        del self._current_uris[curi.url]
        for sink in self._sinks:
            sink.process_successful_crawl(curi)

    def process_not_found(self, curi):
        """
        Called when an URL was not found.

        This could mean, that the URL has been removed from the server. If
        so, do something about it!

        Override this method in the actual frontier implementation.
        """
        del self._current_uris[curi.url]
        self._ignore_uri(curi)
        for sink in self._sinks:
            sink.process_not_found(curi)

    def process_redirect(self, curi):
        """
        Called when there were too many redirects for an URL, or the site has
        not been updated since the last visit.

        In the latter case, update the internal uri and increase the priority
        level.
        """
        del self._current_uris[curi.url]
        if curi.status_code in [301, 302]:
            # simply ignore the URL. The URL that is being redirected to is
            # extracted and added in the processing
            self._ignore_uri(curi)
        if curi.status_code == 304:
            # the page has not been modified since the last visit! Update it
            # NOTE: prio increasing happens in the prioritizer
            self.update_uri(curi)
        for sink in self._sinks:
            sink.process_redirect(curi)

    def process_server_error(self, curi):
        """
        Called when there was some kind of server error.

        Override this method in the actual frontier implementation.
        """
        del self._current_uris[curi.url]
        self._ignore_uri(curi)
        for sink in self._sinks:
            sink.process_server_error(curi)
class AsyncoreReactor(object):
    """
    Runs an asyncore loop over its own socket map in a daemon thread and
    fires the timers registered with it from that same thread.
    """
    _thread = None  # the reactor thread, set by start()
    _is_live = False  # the loop keeps running while this is True
    logger = logging.getLogger("Reactor")

    def __init__(self):
        # holds (end time, timer) tuples, earliest end time first
        self._timers = PriorityQueue()
        # socket map handed to asyncore.loop
        self._map = {}

    def start(self):
        """Start the reactor loop in a daemon thread."""
        self._is_live = True
        self._thread = threading.Thread(target=self._loop,
                                        name="hazelcast-reactor")
        self._thread.daemon = True
        self._thread.start()

    def _loop(self):
        """
        Body of the reactor thread: alternate between polling the sockets and
        checking the timers until the reactor is shut down.

        select errors are ignored; any other error is logged and ends the
        thread.
        """
        self.logger.debug("Starting Reactor Thread")
        Future._threading_locals.is_reactor_thread = True
        while self._is_live:
            try:
                asyncore.loop(count=10000, timeout=0.1, map=self._map)
                self._check_timers()
            except select.error:
                # TODO: parse error type to catch only error "9"
                pass
            except:
                self.logger.exception("Error in Reactor Thread")
                # TODO: shutdown client
                return
        self.logger.debug("Reactor Thread exited.")

    def _check_timers(self):
        """
        Walk the timers in queue order, dropping each one whose
        `check_timer(now)` returns true and stopping at the first one that
        returns false.
        """
        now = time.time()
        while not self._timers.empty():
            try:
                # peek at the head without popping it
                _, timer = self._timers.queue[0]
            except IndexError:
                return
            if timer.check_timer(now):
                self._timers.get_nowait()
            else:
                return

    def add_timer_absolute(self, timeout, callback):
        """
        Register `callback` with a timer created for the absolute time
        `timeout` and return that timer.
        """
        # NOTE(review): `_cleanup_timer` is not defined in the visible part
        # of this class; it is expected to be provided elsewhere -- confirm.
        timer = Timer(timeout, callback, self._cleanup_timer)
        self._timers.put_nowait((timer.end, timer))
        return timer

    def add_timer(self, delay, callback):
        """Register `callback` to run `delay` seconds from now."""
        return self.add_timer_absolute(delay + time.time(), callback)

    def shutdown(self):
        """
        Close every connection, stop the loop and wait for the reactor thread.

        EBADF errors raised while closing are ignored; other OSErrors
        propagate. Safe to call on a reactor that was never started.
        """
        # Iterate over a snapshot so the loop stays valid even if close()
        # mutates the map (values() is a live view on Python 3).
        for connection in list(self._map.values()):
            try:
                connection.close(HazelcastError("Client is shutting down"))
            except OSError as err:
                if err.args[0] == socket.EBADF:
                    pass
                else:
                    raise
        self._map.clear()
        self._is_live = False
        if self._thread is not None:
            self._thread.join()