def copy_job(self, max_keys=1000):
    """Copy every key under the source prefix to the target bucket.

    Spawns ``self.parallel`` BucketCopyWorker threads fed from a LIFO
    queue, then pages through the source bucket listing ``max_keys``
    keys at a time until the listing is exhausted.

    :param max_keys: page size for each ``get_all_keys`` listing call.
    """
    logging.info('start copy_bucket')
    src = self.job['source']
    tgt = self.job['target']

    conn = self.get_conn(tgt['owner'])
    srcBucket = conn.get_bucket(src['bucket'])
    tgtBucket = conn.get_bucket(tgt['bucket'])

    if self.job['options']['allow-acl-change']:
        # A second view of the source bucket under the source owner's
        # credentials so workers can read the ACLs they will rewrite.
        ownerBucketView = self.get_conn(src['owner']).get_bucket(src['bucket'])
        ownerID = self.users[tgt['owner']]['canonical-id']
    else:
        ownerBucketView = None
        ownerID = None

    resultMarker = ''
    q = LifoQueue(maxsize=5000)

    for i in range(self.parallel):
        logging.info('adding worker %d' % i)
        t = BucketCopyWorker(q, srcBucket, tgtBucket,
                             src['key-prefix'], tgt['key-prefix'],
                             ownerBucketView, ownerID)
        t.daemon = True
        t.start()

    while True:
        logging.info('fetch next 1000, backlog currently at %i' % q.qsize())
        keys = srcBucket.get_all_keys(prefix=src['key-prefix'],
                                      max_keys=max_keys,
                                      marker=resultMarker)
        for k in keys:
            q.put(k.key)
        if len(keys) < max_keys:
            # Short page means the listing is exhausted.
            # (Was a stray `print 'Done'`; use logging like the rest.)
            logging.info('Done')
            break
        # BUG FIX: original read keys[maxKeys - 1] — `maxKeys` is an
        # undefined name (the parameter is `max_keys`), so every full
        # page raised NameError before the marker could advance.
        resultMarker = keys[max_keys - 1].key

    q.join()
    logging.info('done copy_bucket')
def copyBucket(maxKeys=1000): print 'start' s_conn = S3Connection(source_aws_key, source_aws_secret_key) srcBucket = s_conn.get_bucket(srcBucketName) resultMarker = '' q = LifoQueue(maxsize=5000) for i in range(10): print 'adding worker' t = Worker(q) t.daemon = True t.start() while True: print 'fetch next 1000, backlog currently at %i' % q.qsize() keys = srcBucket.get_all_keys(max_keys=maxKeys, marker=resultMarker) for k in keys: q.put(k.key) if len(keys) < maxKeys: print 'Done' break resultMarker = keys[maxKeys - 1].key q.join() print 'done'
def Plan(self, start_config, goal_config): start_time = time.time() if self.visualize and hasattr(self.planning_env, "InitializePlot"): self.planning_env.InitializePlot(goal_config) plan = [] # TODO: Here you will implement the breadth first planner # The return path should be a numpy array # of dimension k x n where k is the number of waypoints # and n is the dimension of the robots configuration space q = LifoQueue() start_id = self.planning_env.discrete_env.ConfigurationToNodeId(start_config) goal_id = self.planning_env.discrete_env.ConfigurationToNodeId(goal_config) found = False q.put(start_id) explored =[start_id] backtrack = {} backtrack[start_id] = None n= 0 while (q.qsize()>0) and not found: current = q.get() successors = self.planning_env.GetSuccessors(current) for successor in successors: if not successor in backtrack: n = n+1 q.put(successor) #explored.append(successor) backtrack[successor] = current if self.visualize: s = self.planning_env.discrete_env.NodeIdToConfiguration(successor) c = self.planning_env.discrete_env.NodeIdToConfiguration(current) self.planning_env.PlotEdge(c,s) if successor == goal_id: found = True break # Shortest Path path = [] path.append(self.planning_env.discrete_env.NodeIdToConfiguration(goal_id)) element = backtrack[goal_id] while element is not None: path.append(self.planning_env.discrete_env.NodeIdToConfiguration(element)) element = backtrack[element] plan = path[::-1] if self.visualize: for i in range(len(path) - 1): self.planning_env.PlotRedEdge(path[i],path[i+1]) print "number of nodes" print n print "time (in seconds):" print time.time()- start_time path_length = 0 for i in range(len(path) - 1): path_length = path_length + self.planning_env.ComputeDistance(self.planning_env.discrete_env.ConfigurationToNodeId(path[i]), self.planning_env.discrete_env.ConfigurationToNodeId(path[i+1])) print "path path_length" print path_length return plan
def copy_bucket(aws_key, aws_secret_key, src_bucket_name, dst_bucket_name): print print 'Start copy of %s to %s' % (src_bucket_name, dst_bucket_name) print max_keys = 1000 conn = S3Connection(aws_key, aws_secret_key) srcBucket = conn.get_bucket(src_bucket_name) result_marker = '' q = LifoQueue(maxsize=5000) for i in range(20): print 'Adding worker thread %s for queue processing' % i t = Worker(q, i, aws_key, aws_secret_key, src_bucket_name, dst_bucket_name) t.daemon = True t.start() i = 0 while True: print 'Fetch next %s, backlog currently at %s, have done %s' % (max_keys, q.qsize(), i) try: keys = srcBucket.get_all_keys(max_keys=max_keys, marker=result_marker) if len(keys) == 0: break for k in keys: i += 1 q.put(k.key) print 'Added %s keys to queue' % len(keys) if len(keys) < max_keys: print 'All items now in queue' break result_marker = keys[max_keys - 1].key while q.qsize() > (q.maxsize - max_keys): time.sleep(1) # sleep if our queue is getting too big for the next set of keys except BaseException: logging.exception('error during fetch, quitting') break print 'Waiting for queue to be completed' q.join() print print 'Done' print
def graham_scan(points):
    """
    Compute the convex hull of a set of 2-D points (Graham scan).

    :param points: numpy array of 2-dimensional points
    :return: Convex hull as another numpy array of points
    """
    ch = LifoQueue()  # stack of current hull candidates (top = most recent)
    leftmost = points[np.argmin(points[:, 0])]  # finding the leftmost point... definitely in CH
    # Structured array so points can be sorted by angle (slope), then x.
    dtype = [('x', np.float64), ('y', np.float64), ('slope', np.float64)]  # preparing a nicer object for sorting
    cpts = np.zeros(len(points) - 1, dtype=dtype)
    # NOTE(review): only points[1:] are loaded, which assumes the leftmost
    # point is points[0] — confirm with callers.
    cpts[:]['x'] = points[1:, 0]
    cpts[:]['y'] = points[1:, 1]
    cpts[:]['slope'] = (cpts[:]['y'] - leftmost[1]) / (cpts[:]['x'] - leftmost[0])  # angle <-> slope from leftmost
    sorted_pts = np.sort(cpts, order=['slope', 'x'])  # sort by angle (slope), then distance from leftmost

    # shows which points are colinear
    mask = np.zeros(len(sorted_pts), dtype=bool)  # getting rid of points with same angle from leftmost
    mask = np.logical_not(mask)
    for i in range(len(sorted_pts[1:])):
        # NOTE(review): at i == 0 this writes mask[-1] (the last element);
        # looks like an off-by-one — confirm the intended comparison window.
        mask[i - 1] = not sorted_pts[i - 1]['slope'] == sorted_pts[i]['slope']  # only keep farthest away
    sorted_pts = sorted_pts[mask]
    sorted_pts[:] = sorted_pts[::-1]  # sort greatest slope to lowest (move clockwise)

    pts = np.zeros((len(sorted_pts) + 1, 2))  # putting leftmost back into a new array object
    pts[1:, 0] = sorted_pts[:]['x']
    pts[1:, 1] = sorted_pts[:]['y']
    pts[0] = leftmost

    ch.put(pts[0])  # leftmost and the point with the highest slope are in the CH for sure
    ch.put(pts[1])
    for i, pt in enumerate(pts):
        if i < 2:
            continue
        else:
            last = ch.get()
            second_to_last = ch.get()
            side = which_side(second_to_last, pts[i], last)  # Less than 0 => on the left, o/w on the right
            while side > 0:  # if last point put in on right side, it must have been wrong to be in CH
                last = second_to_last
                second_to_last = ch.get()
                side = which_side(second_to_last, pts[i], last)
            ch.put(second_to_last)
            ch.put(last)
            ch.put(pt)
    # qsize() is evaluated once, so this drains the whole stack
    # (in LIFO order, i.e. the hull comes out reversed).
    return np.array([ch.get() for i in range(ch.qsize())])  # Put the queue into an array
def iterative(path, path_data):
    """Store path_data under path, walking nested dicts with an explicit stack."""
    pending = LifoQueue()
    current_path, current_data = path, path_data
    while True:
        if type(current_data) == dict and current_data:
            # Non-empty dict: store each child and remember nested dicts
            # for a later pass.
            for node in current_data:
                child_path = current_path + '/' + node
                child_data = current_data[node]
                changes.append(self.store_one(child_path, child_data))
                if type(child_data) == dict:
                    pending.put([child_path, child_data])
        else:
            # Leaf value (or empty dict): store it directly.
            changes.append(self.store_one(current_path, current_data))
        if not pending.qsize():
            break
        current_path, current_data = pending.get()
def process_transaction(self, transaction_id):
    """Run every task of a transaction; roll back completed ones on failure.

    Successfully run tasks are pushed onto a LIFO stack so that, if a
    later task fails, they can be reversed in the opposite order they ran.

    :param transaction_id: identifier of the transaction in storage.
    :return: ``{'success': True}`` when all tasks ran, otherwise
             ``{'error': True, 'processed': i}`` where ``i`` is the index
             of the failing task.
    """
    stack = LifoQueue()
    tasks = self.storage.get_tasks(transaction_id)
    logger.debug(tasks)
    for i, task in enumerate(tasks):
        try:
            task = Task(task)
            task.run()
            self.storage.set_task_processed(transaction_id, i, True)
            stack.put(task)
        # BUG FIX: was a bare `except:`, which also swallows SystemExit
        # and KeyboardInterrupt; catch Exception instead.
        except Exception:
            logger.critical(format_exc())
            self.storage.set_task_processed(transaction_id, i, False)
            # Undo the already-completed tasks, most recent first.
            while stack.qsize():
                task = stack.get()
                task.reverse()
            return {
                'error': True,
                'processed': i,
            }
    return {
        'success': True
    }
def stage_one(skel_img, dt, anisotropy):
    """stage one, finds all nodes and edges, except for loops

    Traces a 3-D skeleton volume voxel by voxel using a LIFO work queue,
    recording terminal/branch nodes and the edges (voxel paths plus
    distance-transform samples) between them.

    :param skel_img: skeleton volume (numpy array); consumed via a deepcopy.
    :param dt: distance-transform volume, sampled along each traced edge.
    :param anisotropy: not referenced in this body — NOTE(review): confirm
        whether norm3d/check_box apply it internally.
    :return: (is_node_map, is_term_map, nodes, edges, loop_list)
    """
    # initializing
    volume = deepcopy(skel_img)
    is_queued_map = np.zeros(volume.shape, dtype=int)  # 1 where a voxel was queued
    is_node_map = np.zeros(volume.shape, dtype=int)    # node id at node voxels
    is_term_map = np.zeros(volume.shape, dtype=int)    # node id at terminal voxels
    nodes = {}       # node id -> voxel coordinate
    edges = []       # [np.array([from, to]), length, voxel list, dt list]
    last_node = 1
    current_node = 1
    queue = LifoQueue()
    point = init(volume)  # seed voxel; (-1,-1,-1) means the volume is empty
    loop_list = []
    branch_point_list = []
    node_list = []
    length = 0

    if (point == np.array([-1, -1, -1])).all():
        return is_node_map, is_term_map, nodes, edges, loop_list

    is_queued_map[point[0], point[1], point[2]] = 1
    not_queued, is_node_list, are_near = check_box(volume, point, is_queued_map, is_node_map)
    nodes[current_node] = point

    # The seed may be an isolated voxel with no free neighbours: erase it
    # and re-seed until a voxel with at least one unqueued neighbour is found.
    while len(not_queued) == 0:
        volume[point[0], point[1], point[2]] = 0
        is_queued_map[point[0], point[1], point[2]] = 0
        nodes = {}
        point = init(volume)
        if (point == np.array([-1, -1, -1])).all():
            return is_node_map, is_term_map, nodes, edges, loop_list
        is_queued_map[point[0], point[1], point[2]] = 1
        not_queued, is_node_list, are_near = check_box(volume, point, is_queued_map, is_node_map)
        nodes[current_node] = point

    # Queue each free neighbour of the seed as the start of an edge walk.
    for i in not_queued:
        queue.put(np.array([i, current_node, length,
                            [[point[0], point[1], point[2]]],
                            [dt[point[0], point[1], point[2]]]]))
        is_queued_map[i[0], i[1], i[2]] = 1

    if len(not_queued) == 1:
        # A single neighbour makes the seed a terminal point.
        is_term_map[point[0], point[1], point[2]] = last_node
        is_node_map[point[0], point[1], point[2]] = last_node
    else:
        is_node_map[point[0], point[1], point[2]] = last_node

    while queue.qsize():
        # pull item from queue
        point, current_node, length, edge_list, dt_list = queue.get()
        not_queued, is_node_list, are_near = check_box(volume, point, is_queued_map, is_node_map)

        # standart point
        if len(not_queued) == 1:
            dt_list.extend([dt[point[0], point[1], point[2]]])
            edge_list.extend([[point[0], point[1], point[2]]])
            length = length + norm3d(point, not_queued[0])
            queue.put(np.array([not_queued[0], current_node, length,
                                edge_list, dt_list]))
            is_queued_map[not_queued[0][0], not_queued[0][1], not_queued[0][2]] = 1
            # NOTE(review): every ordinary path voxel is appended to
            # branch_point_list here — confirm that is intended.
            branch_point_list.extend([[point[0], point[1], point[2]]])
        elif len(not_queued) == 0 and (len(are_near) > 1 or len(is_node_list) > 0):
            # Dead end that touches already-visited voxels: part of a loop.
            loop_list.extend([current_node])
        # terminating point
        elif len(not_queued) == 0 and len(are_near) == 1 and len(is_node_list) == 0:
            last_node = last_node + 1
            nodes[last_node] = point
            dt_list.extend([dt[point[0], point[1], point[2]]])
            edge_list.extend([[point[0], point[1], point[2]]])
            node_list.extend([[point[0], point[1], point[2]]])
            edges.extend([[np.array([current_node, last_node]), length, edge_list, dt_list]])
            is_term_map[point[0], point[1], point[2]] = last_node
            is_node_map[point[0], point[1], point[2]] = last_node
        # branch point
        elif len(not_queued) > 1:
            dt_list.extend([dt[point[0], point[1], point[2]]])
            edge_list.extend([[point[0], point[1], point[2]]])
            last_node = last_node + 1
            nodes[last_node] = point
            # build edge
            edges.extend([[np.array([current_node, last_node]), length, edge_list, dt_list]])
            node_list.extend([[point[0], point[1], point[2]]])
            # putting node branches in the queue
            for x in not_queued:
                length = norm3d(point, x)
                queue.put(np.array([x, last_node, length,
                                    [[point[0], point[1], point[2]]],
                                    [dt[point[0], point[1], point[2]]]]))
                is_queued_map[x[0], x[1], x[2]] = 1
            is_node_map[point[0], point[1], point[2]] = last_node

    return is_node_map, is_term_map, nodes, edges, loop_list
class ThreadPool(object):
    """Thread pool whose task fetches are rate-limited per time period."""

    def __init__(self, threadNum, max_tasks_per_period=10, seconds_per_period=30):
        self.pool = []                       # worker threads
        self.threadNum = threadNum           # number of worker threads
        self.runningLock = Lock()            # guards self.running
        self.taskLock = Lock()               # lock for the getTask function
        self.running = 0                     # number of currently running workers
        # LIFO queue: after the first post's page has been fetched, all of
        # its comment pages are queued next; LIFO order guarantees the first
        # post's comments are fetched and stored as soon as possible.
        self.taskQueue = LifoQueue()         # task queue
        # maximum number of page visits allowed within one period
        self.max_tasks_per_period = max_tasks_per_period
        # number of seconds that make up one period
        self.seconds_per_period = seconds_per_period
        # number of pages already visited in the current period
        self.currentPeriodVisits = 0
        # treat each period as one crawl cycle; start time of the current
        # period, initialised with the current time
        self.periodStart = time.time()

    def startThreads(self):
        """Create a certain number of threads and started to run
        All Workers share the same ThreadPool
        """
        # begin the current crawl period
        self.periodStart = time.time()
        for i in range(self.threadNum):
            self.pool.append(Worker(self, i))

    def stopThreads(self):
        # Stop and join every worker, then empty the pool in place.
        for thread in self.pool:
            thread.stop()
            thread.join()
        del self.pool[:]

    def putTask(self, func, *args, **kargs):
        # Queue a callable plus its arguments for a worker to execute.
        self.taskQueue.put((func, args, kargs))

    def getTask(self, *args, **kargs):
        # Rate control: check whether the number of pages visited in the
        # current period has reached the cap.
        if self.currentPeriodVisits >= self.max_tasks_per_period - 2:
            timeNow = time.time()
            seconds = timeNow - self.periodStart
            if seconds < self.seconds_per_period:
                # Still inside the current period: sleep out the remainder.
                remain = self.seconds_per_period - seconds
                print "ThreadPool Waiting for " + str(remain) + " seconds."
                time.sleep(int(remain + 1))
            self.periodStart = time.time()  # reset the period start time
            self.currentPeriodVisits = 0
        try:
            # Non-blocking fetch: workers poll instead of blocking on empty.
            task = self.taskQueue.get_nowait()
        except Empty:
            return (None, None, None)
        self.currentPeriodVisits += 1
        return task

    def taskJoin(self, *args, **kargs):
        """Queue.join: Blocks until all items in the queue have been gotten and processed.
        """
        self.taskQueue.join()

    def taskDone(self, *args, **kargs):
        self.taskQueue.task_done()

    def increaseRunsNum(self):
        self.runningLock.acquire()
        self.running += 1  # one more worker is running
        self.runningLock.release()

    def decreaseRunsNum(self):
        self.runningLock.acquire()
        self.running -= 1
        self.runningLock.release()

    def getTaskLeft(self):
        # Outstanding work = tasks not yet fetched from taskQueue plus tasks
        # currently being executed. (The original comment also counted a
        # resultQueue, but none is tracked in this class.)
        return self.taskQueue.qsize() + self.running
# Daemonize the path worker so it cannot keep the process alive on exit.
pathWorker.setDaemon(True)
pathWorker.start()

# now the worker threads are processing lets feed the fileQueue, this will block if the
# rework file is larger than the queue.
primeQueues(fileQueue, dirQueue)

if args.debug:
    # CSV-style header for the queue-depth trace emitted below
    queueMsg("\"max\", \"file\", \"dir\", \"results\"")

# lets just hang back and wait for the queues to empty
print "If you need to pause this job, press Ctrl-C once"
time.sleep(1)

while not terminateThreads:
    if args.debug:
        # periodic queue-depth sample: (max, file, dir, results)
        queueMsg("\"%s\", \"%s\", \"%s\", \"%s\"\n" % (args.queueParams['max'], fileQueue.qsize(), dirQueue.qsize(), resultsQueue.qsize()))
    time.sleep(.1)
    if fileQueue.empty() and dirQueue.empty():
        # Both feed queues drained: emit one final sample, then shut down
        # in order (dirs -> files -> workers -> results).
        queueMsg("\"%s\", \"%s\", \"%s\", \"%s\"\n" % (args.queueParams['max'], fileQueue.qsize(), dirQueue.qsize(), resultsQueue.qsize()))
        print "waiting for directory queue to clear..."
        dirQueue.join()
        print "waiting for file queue to clear..."
        fileQueue.join()
        print "waiting for worker processes to complete..."
        # Flag checked by the worker threads (defined elsewhere) to stop.
        terminateThreads = True
        print "waiting for results queue to clear..."
        resultsQueue.join()
        print "exporting statistics..."
        exportStats()
        print "closing files..."
# Python program to
# demonstrate stack implementation
# using queue module

from Queue import LifoQueue

# Initializing a stack
# BUG FIX: the constructor keyword is `maxsize`, not `max_size`;
# the original call raised TypeError: unexpected keyword argument.
stack = LifoQueue(maxsize=3)

# qsize() show the number of elements
# in the stack
print(stack.qsize())

# put() function to push
# element in the stack
stack.put('a')
stack.put('b')
stack.put('c')

print("Full: ", stack.full())
print("Size: ", stack.qsize())

# get() function to pop
# element from stack in
# LIFO order
print('\nElement poped from the stack')
print(stack.get())
print(stack.get())
print(stack.get())

print("\nEmpty: ", stack.empty())
def copy_bucket(aws_key, aws_secret_key, src, dst): max_keys = 1000 conn = S3Connection(aws_key, aws_secret_key) try: (src_bucket_name, src_path) = src.split('/', 1) except ValueError: src_bucket_name = src src_path = None try: (dst_bucket_name, dst_path) = dst.split('/', 1) except ValueError: dst_bucket_name = dst dst_path = None if dst_path is not None: raise ValueError("not currently implemented to set dest path; must use default, which will mirror the source") src_bucket = conn.get_bucket(src_bucket_name) print print 'Start copy of %s to %s' % (src, dst) print result_marker = '' q = LifoQueue(maxsize=5000) for i in range(20): print 'Adding worker thread %s for queue processing' % i t = Worker(q, i, aws_key, aws_secret_key, src_bucket_name, dst_bucket_name, src_path, dst_path) t.daemon = True t.start() i = 0 while True: print 'm (%s): Fetch next %s, backlog currently at %s, have done %s' % (src_path, max_keys, q.qsize(), i) try: keys = src_bucket.get_all_keys(max_keys=max_keys, marker=result_marker, prefix=src_path or '') if len(keys) == 0: break for k in keys: i += 1 q.put(k.key) # print 'Added %s keys to queue' % len(keys) if len(keys) < max_keys: print 'All items now in queue' break result_marker = keys[max_keys - 1].key while q.qsize() > (q.maxsize - max_keys): time.sleep(1) # sleep if our queue is getting too big for the next set of keys except BaseException: logging.exception('error during fetch, quitting') break print 'm (%s): Waiting for queue to be completed' % (src_path) q.join() print print 'm (%s): Done' % (src_path) print
if __name__=='__main__':
    # Crawler entry point: fill url_queue with listing pages, scrape them
    # on threads, then consume parsed results from result_queue.
    post_db=DB('Post')
    picture_db=DB('Picture')
    url_queue=LifoQueue()
    result_queue=LifoQueue()

    if len(sys.argv)==1:
        # No arguments: queue only the first page of every listing pattern.
        for key in list_pattern.keys():
            url_queue.put(host+list_pattern[key]['page1'])
    else:
        # Any argument: discover the page count and queue every page.
        for key in list_pattern.keys():
            max_page=get_max_page(list_pattern[key]['page1'])
            url_queue.put(host+list_pattern[key]['page1'])
            for page in range(2,max_page+1):
                url_queue.put((host+list_pattern[key]['page']).format(page=page))

    print('{} pages waiting for parse'.format(url_queue.qsize()))

    # One thread per queued URL; presumably get_pid pushes its findings
    # onto result_queue — verify against its definition.
    tasks=[]
    while 1:
        url=url_queue.get()
        t=Thread(target=get_pid,args=(url,))
        tasks.append(t)
        if url_queue.empty():
            break
    for task in tasks:
        task.start()
    for task in tasks:
        task.join()

    # Drain the results. NOTE(review): result_queue.get() blocks forever
    # once empty and this loop body appears truncated in this view —
    # confirm the exit path in the full file.
    while 1:
        url,title,cate=result_queue.get()
        id=id_reg.findall(url)[0]
        poster='http://img1.mm131.me/pic/{}/0.jpg'.format(id)
def copy_bucket(aws_key, aws_secret_key, args): max_keys = 1000 src = args.src_bucket dst = args.dest_bucket conn = S3Connection(aws_key, aws_secret_key) try: (src_bucket_name, src_path) = src.split('/', 1) except ValueError: src_bucket_name = src src_path = None try: (dst_bucket_name, dst_path) = dst.split('/', 1) except ValueError: dst_bucket_name = dst dst_path = None src_bucket = conn.get_bucket(src_bucket_name) if args.verbose: print print 'Start copy of %s to %s' % (src, dst) print result_marker = '' q = LifoQueue(maxsize=5000) for i in xrange(args.threads_no): if args.verbose: print 'Adding worker thread %s for queue processing' % i t = Worker(q, i, aws_key, aws_secret_key, src_bucket_name, dst_bucket_name, src_path, dst_path, args) t.daemon = True t.start() i = 0 while True: if args.verbose: print 'Fetch next %s, backlog currently at %s, have done %s' % \ (max_keys, q.qsize(), i) try: keys = src_bucket.get_all_keys(max_keys=max_keys, marker=result_marker, prefix=src_path or '') if len(keys) == 0: break for k in keys: i += 1 q.put(k.key) if args.verbose: print 'Added %s keys to queue' % len(keys) if len(keys) < max_keys: if args.verbose: print 'All items now in queue' break result_marker = keys[max_keys - 1].key while q.qsize() > (q.maxsize - max_keys): time.sleep(1) # sleep if our queue is getting too big for the next set of keys except BaseException: logging.exception('error during fetch, quitting') break if args.verbose: print 'Waiting for queue to be completed' q.join() if args.verbose: print print 'Done' print
class B2BucketThreadedLocal(B2Bucket): def __init__(self, *args): super(B2BucketThreaded, self).__init__( *args) num_threads=50 self.queue = LifoQueue(num_threads*2) self.file_locks = defaultdict(Lock) self.running = True self.threads = [] print "Thread ", for i in xrange(num_threads): t = threading.Thread(target=self._file_updater) t.start() self.threads.append(t) print ".", print self.pre_queue_lock = Lock() self.pre_queue_running = True self.pre_queue = LifoQueue(num_threads*2) self.pre_file_dict = {} self.pre_thread = threading.Thread(target=self._prepare_update) self.pre_thread.start() def _prepare_update(self): while self.pre_queue_running: try: filename, local_filename, operation = self.pre_queue.get(True,1) self.pre_file_dict[filename] = (time(), local_filename, operation) self.pre_queue.task_done() except Empty: for filename, (timestamp, local_filename, operation) in self.pre_file_dict.items(): if time()-timestamp > 15: self.queue.put((filename, local_filename, operation)) del self.pre_file_dict[filename] for filename, (timestamp, local_filename, operation) in self.pre_file_dict.items(): self.queue.put((filename, local_filename, operation)) del self.pre_file_dict[filename] def _file_updater(self): while self.running: try: filename, local_filename, operation = self.queue.get(True,1) except Empty: continue with self.file_locks[filename]: if operation == "deletion": super(B2BucketThreaded,self)._delete_file(filename) self.queue.task_done() elif operation == "upload": super(B2BucketThreaded,self)._put_file(filename, local_filename) self.queue.task_done() elif operation == "download": super(B2BucketThreaded,self)._get_file(filename, local_filename) self.queue.task_done() else: self.logger.error("Invalid operation %s on %s" % (operation, filename)) def __enter__(self): return self def __exit__(self, *args, **kwargs): self.logger.info("Waiting for all B2 requests to complete") self.logger.info("Pre-Queue contains %s elements", self.pre_queue.qsize()) 
self.pre_queue.join() self.logger.info("Joining pre queue thread") self.pre_queue_running = False self.pre_thread.join() self.logger.info("Queue contains %s elements", self.queue.qsize()) self.queue.join() self.logger.info("Joining threads") self.running = False for t in self.threads: t.join() def put_file(self, filename, local_filename): with self.pre_queue_lock: self.logger.info("Postponing upload of %s (%s)", filename, len(data)) self.pre_queue.put((filename, local_filename, "upload"), True) new_file = {} new_file['fileName'] = filename new_file['fileId'] = None new_file['uploadTimestamp'] = time() new_file['action'] = 'upload' new_file['contentLength'] = len(data) return new_file def delete_file(self, filename): with self.pre_queue_lock: self.logger.info("Postponing deletion of %s", filename) self.pre_queue.put((filename, None, "deletion"),True) def get_file(self, filename, local_filename): with self.pre_queue_lock: self.logger.info("Postponing download of %s", filename) self.pre_queue.put((filename, local_filename, "download"),True)
def copy_bucket(aws_key, aws_secret_key, args): max_keys = 1000 src = args.src_bucket dst = args.dest_bucket conn = S3Connection(aws_key, aws_secret_key) try: (src_bucket_name, src_path) = src.split('/', 1) except ValueError: src_bucket_name = src src_path = None try: (dst_bucket_name, dst_path) = dst.split('/', 1) except ValueError: dst_bucket_name = dst dst_path = None src_bucket = conn.get_bucket(src_bucket_name) if args.verbose: print print 'Start copy of %s to %s' % (src, dst) print result_marker = '' q = LifoQueue(maxsize=5000) for i in xrange(args.threads_no): if args.verbose: print 'Adding worker thread %s for queue processing' % i t = Worker(q, i, aws_key, aws_secret_key, src_bucket_name, dst_bucket_name, src_path, dst_path, args) t.daemon = True t.start() i = 0 while True: if args.verbose: print 'Fetch next %s, backlog currently at %s, have done %s' % \ (max_keys, q.qsize(), i) try: keys = src_bucket.get_all_keys(max_keys=max_keys, marker=result_marker, prefix=src_path or '') if len(keys) == 0: break for k in keys: i += 1 q.put(k.key) if args.verbose: print 'Added %s keys to queue' % len(keys) if len(keys) < max_keys: if args.verbose: print 'All items now in queue' break result_marker = keys[max_keys - 1].key while q.qsize() > (q.maxsize - max_keys): time.sleep( 1 ) # sleep if our queue is getting too big for the next set of keys except BaseException: logging.exception('error during fetch, quitting') break if args.verbose: print 'Waiting for queue to be completed' q.join() if args.verbose: print print 'Done' print
class Speech_Recognizer:
    """Glue between a live speech sampler, an ASR feature builder and a
    speech state machine, with an optional interactive console menu."""

    def __init__(self, speech_state_machine):
        # Speech sampler
        self.__fs = 16000  # sample rate in Hz
        self.__sampler_window_duration = 5 # seconds
        self.__sampler = Speech_Sampler(self.__sampler_window_duration, self.__fs)

        # Feature builder
        self.__feature_window_duration = 0.025 # seconds
        self.__feature_skip_duration = 0.01 # seconds
        self.__feature_nfilters = 26
        self.__feature_nfilters_keep = 13
        self.__feature_radius = 2
        self.__feature_builder = ASR_Feature_Builder()

        # Processing
        self.__ignore_sample = False            # drop samples arriving while a match runs
        self.__max_queue_size = 10              # cap on queued speech segments
        self.__pool = multiprocessing.Pool()    # used to run plot jobs out of process
        self.__process_sleep_time = 0.025 # seconds
        self.__queue_lock = threading.Lock()    # guards __speech_segments
        self.__plot_option = -1                 # -1 = no plots; >= 0 selects a plot
        self.__speech_segments = LifoQueue()    # newest segments processed first
        self.__speech_state_machine = speech_state_machine
        self.__stop_processing = False          # shutdown flag for worker threads

        self.__feature_builder.set_plot_blocking(True)
        self.__sampler.add_sample_callback(self.__queue_speech_segment)
        self.__sampler.hide_spectrogram_plot()
        self.__sampler.hide_zero_crossing_plot()
        self.__speech_state_machine.add_speech_match_callback(self.__speech_matched)

    def __empty_speech_segment_queue(self):
        # Discard all queued segments under the lock.
        self.__queue_lock.acquire(True)
        while not self.__speech_segments.empty():
            self.__speech_segments.get()
        self.__queue_lock.release()

    def __get_speech_segment(self):
        # Pop the newest segment, or return None when the queue is empty.
        speech_segment = None
        self.__queue_lock.acquire(True)
        if not self.__speech_segments.empty():
            speech_segment = self.__speech_segments.get()
        self.__queue_lock.release()
        return speech_segment

    def __handle_interactive(self):
        # Console menu loop; runs on its own thread until shutdown.
        while not self.__stop_processing:
            invalid_selection = True
            while invalid_selection:
                os.system('cls')
                message = (
                    "Please enter the number of the option you wish to execute:\n"
                    " 0) Pause\n"
                    " 1) No plots\n"
                    " 2) Clear data queue\n"
                    " 3) Start save speech segments (saves in current working directory)\n"
                    " 4) Stop save speech segments\n"
                    " 5) Start speech segment playback\n"
                    " 6) Stop speech segment playback\n"
                    " 7) Plot full feature matrix\n"
                    " 8) Plot mfcc feature matrix\n"
                    " 9) Plot delta feature matrix\n"
                    " 10) Plot filter banks\n"
                    " 11) Plot filter bank filtered spectra sum\n"
                    " 12) Plot filter bank filtered spectra sum log\n"
                    " 13) Plot filter bank filtered spectra sum log dct (mfcc)\n"
                    " 14) Plot mfcc transitions\n"
                    " 15) Plot speech segment\n"
                    " 16) Plot viterbi path\n"
                    "\n"
                    "To resume you have to exit the plot cause matplotlib is stupid...\n"
                )
                print message
                text = raw_input("Enter your selection: ")
                try:
                    option = int(text)
                    if option > 16:
                        continue
                    else:
                        invalid_selection = False
                    if option == 0:
                        self.__sampler.pause()
                    elif option == 1:
                        self.__plot_option = -1
                    elif option == 2:
                        self.__empty_speech_segment_queue()
                    elif option == 3:
                        self.__sampler.save_samples(os.getcwd())
                    elif option == 4:
                        self.__sampler.save_samples(None, False)
                    elif option == 5:
                        self.__sampler.play_samples(True)
                    elif option == 6:
                        self.__sampler.play_samples(False)
                    else:
                        # Options 7..16 map to plot indices 0..9.
                        self.__plot_option = option - 7
                except ValueError:
                    continue
            time.sleep(0.1)

    def __process_speech_segments(self):
        # Worker loop: build features for each queued segment and feed
        # them to the state machine.
        while not self.__stop_processing:
            time.sleep(self.__process_sleep_time)
            speech_segment = self.__get_speech_segment()
            if speech_segment is None:
                continue

            # Scale the float segment to int16 range before feature extraction.
            feature_matrix = self.__feature_builder.compute_features_for_signal( \
                np.round(speech_segment * np.iinfo(np.int16).max).astype(np.int16), \
                self.__fs, \
                self.__feature_nfilters, \
                self.__feature_window_duration, \
                self.__feature_skip_duration, \
                self.__feature_radius, \
                self.__feature_nfilters_keep)

            # Suppress incoming samples while the state machine updates.
            self.__ignore_sample = True
            self.__speech_state_machine.update(feature_matrix)
            self.__ignore_sample = False

            plot_options = ASR_Feature_Builder_Plot_Options( \
                self.__feature_builder, \
                self.__plot_option, \
                self.__feature_nfilters_keep \
            )
            self.__pool.map(asr_feature_builder_plot, [plot_options])

    def __queue_speech_segment(self, speech_segment):
        # Sampler callback: enqueue a segment, evicting the oldest when full.
        if self.__ignore_sample:
            return
        self.__queue_lock.acquire(True)
        if self.__speech_segments.qsize() == self.__max_queue_size:
            # LifoQueue has no tail access: pour into a scratch stack,
            # drop the bottom (oldest) item, pour back.
            temp_queue = LifoQueue()
            while not self.__speech_segments.empty():
                temp_queue.put(self.__speech_segments.get())

            # Discard the oldest data
            temp_queue.get()
            while not temp_queue.empty():
                self.__speech_segments.put(temp_queue.get())
        self.__speech_segments.put(speech_segment)
        self.__queue_lock.release()

    def __speech_matched(self, new_hmm, phrase, log_match_probability, is_primary):
        # State-machine callback: plot the matched HMM out of process.
        plot_options = HMM_Plot_Options( \
            new_hmm, \
            self.__plot_option \
        )
        self.__pool.map(hmm_plot, [plot_options])

    def run(self, interactive = True):
        """Start the processing (and optionally interactive) threads and
        run the sampler until it returns; then shut the threads down."""
        # Host-specific output device selection for sounddevice playback.
        if socket.gethostname() == "DESKTOP-NR96827":
            sd.default.device["output"] = "Speakers (High Definition Audio, MME"
        else:
            sd.default.device["output"] = "Speaker/HP (Realtek High Defini, MME"

        processing_thread = threading.Thread(target = self.__process_speech_segments)
        if interactive:
            interactive_thread = threading.Thread(target = self.__handle_interactive)

        processing_thread.start()
        if interactive:
            interactive_thread.start()
        else:
            print("Speak to me oh mighty user...")

        self.__sampler.run(True)  # blocks until sampling stops
        self.__stop_processing = True

        processing_thread.join()
        if interactive:
            interactive_thread.join()
class Gazzle(object):
    """Small web search engine backend.

    Crawls pages into MongoDB, indexes them with Whoosh, computes
    PageRank, and pushes progress/result messages to connected
    websockets.

    Thread layout (all daemon threads, started in __init__):
      - crawl_thread_count crawler threads running _crawl
      - 1 indexer thread running _index (Whoosh writer is not thread-safe)
      - 1 PageRank thread running _pagerank
      - 1 db consistency checker running _assert_thread
    """

    def __init__(self, *args, **kwargs):
        self.sockets = []  # connected websocket clients
        mongo_client = MongoClient('localhost', 27017)
        self.mongo = mongo_client['gazzle']
        # self.mongo.drop_collection('pages')
        self.pages = self.mongo['pages']
        self._init_whoosh()
        self.pageset = {}  # url -> page_id for every page ever seen
        self.crawl_thread_count = kwargs.get('crawl_threads', 3)
        self.pending_crawls = 0  # crawls currently in flight (guarded by pending_lock)
        self.pending_lock = threading.RLock()
        self.frontier = Queue()  # page_ids waiting to be crawled
        self.crawlCount = 0
        self.crawling = False
        self.crawl_cond = threading.Condition()
        self.crawl_lock = threading.RLock()
        self._init_crawl()
        self.index_set = set()  # page_ids already indexed
        self.index_q = LifoQueue()  # normal indexing backlog
        self.index_altq = LifoQueue()  # priority queue used by index_page()
        self.index_alt_switchoff = False
        self.indexing = False
        self.index_cond = threading.Condition()
        self.index_lock = threading.RLock()
        self._init_index()
        self._index_size()
        self.crosssite_crawl = False
        self.pagerank_cond = threading.Condition()
        self._start_thread(target = self._crawl, count = self.crawl_thread_count)
        self._start_thread(target = self._index, count = 1) # index writer doesn't support multithreading
        self._start_thread(target = self._pagerank, count = 1)
        self._start_thread(target = self._assert_thread, count=1)

    def _init_crawl(self):
        """Rebuild the in-memory page set / frontier from MongoDB."""
        self.pageset = {}
        self.frontier = Queue()
        for page in self.pages.find():
            self.pageset[page['url']] = page['page_id']
        # Only pages not yet crawled go back into the frontier.
        for page in self.pages.find({'crawled': False}):
            self.frontier.put(page['page_id'])
        self.crawlCount = self.pages.find({'crawled': True}).count()
        print('Added %d pages to page set' % len(self.pageset))
        print('Added %d pages to frontier' % self.frontier.qsize())
        print('Crawl count set to %d' % self.crawlCount)

    def _init_index(self):
        """Rebuild the in-memory index set / index queue from MongoDB."""
        self.index_set = set()
        self.index_q = LifoQueue()
        for page in self.pages.find({'indexed': True}):
            self.index_set.add(page['page_id'])
        # Pages crawled but not yet indexed form the indexing backlog.
        for page in self.pages.find({'crawled':True, 'indexed': False}):
            self.index_q.put(page['page_id'])
        print('Added %d pages to index set' % len(self.index_set))
        print('Added %d pages to index queue' % self.index_q.qsize())

    def _init_whoosh(self, clear = False):
        """Open (or create, when clear is True or missing) the Whoosh index."""
        schema = Schema(page_id=STORED, title=TEXT(stored=True), content=TEXT, url=ID(stored=True))
        if not os.path.exists("index"):
            os.mkdir("index")
            clear = True  # no directory means there is nothing to open
        if clear:
            self.index = create_in('index', schema)
        else:
            self.index = open_dir("index")

    def _assert_thread(self):
        """Watchdog: a crawled page must always have a title."""
        while True:
            a = self.pages.find_one({'crawled': True, 'title': {'$exists': False}})
            assert a == None, 'Found inconsistent page in db ID: %d URL: %s' % (a['page_id'], a['url'])
            time.sleep(1)

    def _pagerank(self):
        """Recompute PageRank each time pagerank_cond is notified.

        Builds a dense transition matrix over crawled+indexed pages and
        runs 30 power iterations, then broadcasts and persists the ranks.
        """
        while True:
            with self.pagerank_cond:
                self.pagerank_cond.wait()
            pages = self.pages.find({'crawled': True, 'indexed': True}, { '_id':False, 'content': False, 'links.url': False })
            RANK_SCALE = 1
            ALPHA = 0.25  # damping / teleport factor
            page_count = pages.count()
            # Map page_ids to contiguous matrix indices and back.
            id_to_ind = {}
            ind_to_id = []
            for page in pages:
                ind = len(id_to_ind)
                ind_to_id.append(page['page_id'])
                id_to_ind[page['page_id']] = ind
            pages.rewind()
            pmat = []
            for page in pages:
                row = [0.0] * page_count
                link_count = 0
                for link in page['links']:
                    # Only count links whose target is itself in the matrix.
                    if link['page_id'] in id_to_ind:
                        ind = id_to_ind[link['page_id']]
                        row[ind] += RANK_SCALE
                        link_count += 1
                alph = ALPHA * RANK_SCALE / page_count
                for ind in range(page_count):
                    if link_count == 0:
                        # Dangling page: distribute uniformly.
                        # NOTE(review): 1 / page_count is integer division
                        # on Python 2 -- presumably intended as a float.
                        row[ind] += 1 / page_count
                    else:
                        row[ind] *= (1 - alph) / link_count
                        row[ind] += alph / page_count
                pmat.append(row)
            # Power iteration starting from a unit vector.
            page_rank = [0] * page_count
            page_rank[0] = 1
            for d in range(30):
                page_rank = dot(page_rank, pmat)
            result = [{"page_id": ind_to_id[x], "rank": self._format_rank(page_rank[x])} for x in range(page_count)]
            self._send_to_all({ 'action': 'page rank', 'pages': result })
            for ind in range(page_count):
                self.pages.update({"page_id": ind_to_id[ind]}, {"$set": {"rank": page_rank[ind]}}, upsert=False)

    def _index(self):
        """Indexer loop plus a background flush thread.

        The shared dict `_` holds the Whoosh writer, the lock protecting
        it, and the list of page_ids written but not yet committed.
        """
        _ = { 'lock': threading.RLock(), 'writer': None, 'need_commit': [], }
        def flush(_):
            # Every 5s commit the writer (if dirty), mark the pages as
            # indexed in MongoDB, and kick the PageRank thread.
            while True:
                if len(_['need_commit']) != 0 and _['writer'] != None:
                    _['lock'].acquire()
                    _['writer'].commit()
                    _['writer'] = None
                    need_tmp = _['need_commit']
                    _['need_commit'] = []
                    _['lock'].release()
                    self._send_to_all({ 'action': 'index commit', 'pages': map(lambda x: {'page_id': x}, need_tmp) })
                    self.pages.update({'page_id' : {'$in': need_tmp}}, {'$set': {'indexed': True}}, multi = True, upsert = False)
                    with self.pagerank_cond:
                        self.pagerank_cond.notify()
                    self._send_to_all({ 'action': 'index size', 'value': self.index_size })
                time.sleep(5)
        self._start_thread(target = flush, kwargs={'_':_})
        while True:
            # Sleep until indexing is enabled AND no crawls are in flight.
            with self.index_cond:
                with self.pending_lock:
                    pending = self.pending_crawls != 0
                while not self.indexing or pending:
                    self.index_cond.wait()
                    with self.pending_lock:
                        pending = self.pending_crawls != 0
            # Prefer the priority queue; fall back to blocking on the
            # normal backlog. NOTE(review): bare except also hides
            # non-Empty errors from index_altq.get().
            try:
                item_index = self.index_altq.get(False)
                if self.index_alt_switchoff:
                    # index_page() turned indexing on only for this item.
                    self.indexing = False
            except:
                item_index = self.index_q.get(True)
            if item_index in self.index_set:
                continue
            item = self.pages.find_one({'page_id': item_index})
            _['lock'].acquire()
            if _['writer'] == None:
                _['writer'] = self.index.writer()
            assert item.get('title') != None , 'Uncrawled page in index queue, ID: %d, URL: %s' %(item['page_id'], item['url'])
            _['writer'].add_document(page_id=item_index, title=item['title'], content=item['content'], url=item['url'])
            _['need_commit'].append(item_index)
            _['lock'].release()
            self.index_set.add(item_index)
            self._send_to_all({ 'action': 'index page', 'page': {'page_id': item_index} })

    def _crawl(self):
        """Crawler worker loop: fetch a frontier page, extract its title,
        body and links, register new links, and persist the result.
        """
        with self.pending_lock:
            self.pending_crawls += 1
        while True:
            with self.pending_lock:
                self.pending_crawls -= 1
            # Park here while crawling is disabled; poke the indexer so it
            # can run while crawlers are idle.
            with self.crawl_cond:
                while not self.crawling:
                    if self.indexing:
                        with self.index_cond:
                            self.index_cond.notify()
                    self.crawl_cond.wait()
            with self.pending_lock:
                self.pending_crawls += 1
            item_index = self.frontier.get(True)
            item = self.pages.find_one({'page_id': item_index})
            page = urllib2.urlopen(item['url'])
            soup = BeautifulSoup(page.read())
            title = soup.title.text #.replace(' - Wikipedia, the free encyclopedia', '')
            if len(title) > 12:
                title = title[:12] + '...'
            body = soup.body.text
            links = map(lambda link: self.extract_anchor_link(link, item['url']), soup.find_all("a"))
            links = filter(lambda link: link != '' and link != None, links)
            with self.crawl_lock:
                # links = filter(lambda link: link not in self.pageset, links)
                print("%s Crawling %s found %d links" % (threading.current_thread().name, item['url'], len(links)))
                result_links = []
                for link in links:
                    if link not in self.pageset:
                        # Brand-new URL: register it and schedule a crawl.
                        page_id = len(self.pageset)
                        self.pages.insert({ 'page_id': page_id, 'url': link, 'crawled': False, 'indexed': False })
                        self.pageset[link] = page_id
                        self.frontier.put(page_id)
                    else:
                        page_id = self.pageset[link]
                    result_links.append({'url': link, 'page_id': page_id})
                self.crawlCount += 1
                self.index_q.put(item_index)
                self.pages.update({'page_id': item_index}, { '$push': {'links': {'$each': result_links}}, '$set': {'title': unicode(title), 'content': unicode(body), 'crawled': True} })
                self._send_to_all(json.dumps([ { 'action': 'crawl page', 'page': {'page_id': item_index, 'url': item['url'], 'link_count': len(links), 'title': title} }, { 'action': 'frontier size', 'value': self.frontier.qsize() }, { 'action': 'crawl size', 'value': self.crawlCount }, ]))

    def extract_anchor_link(self, link, url):
        """Normalize an <a> tag's href against the page url.

        Returns '' for empty/fragment links, resolves protocol-relative
        and root-relative links, and only returns foreign absolute links
        when crosssite_crawl is enabled. Query strings are stripped.
        """
        href = link.get('href', '')
        m = re.match('([^?]+)[?].*', unicode(href))
        if m != None:
            href = m.group(1)  # drop the query string
        if href == '':
            return ''
        if 'https://' in href:
            href = href.replace('https://', 'http://')
        if re.match('#.*', href) != None:
            return ''  # in-page fragment
        elif re.match('//.*', href):
            return 'http:' + href  # protocol-relative
        elif re.match('/.*', href):
            # Root-relative: prefix with the current page's host.
            m = re.match('(http://[0-9a-zA-Z.]+)/*', url)
            # print("link %s %s going to %s" % (href, "", ""))
            return m.group(1) + href
        elif self.crosssite_crawl:
            return href
        return ''

    def search(self, socket, query, rank_part=0):
        """Run a Whoosh query and reply on the websocket.

        rank_part (0-100) blends PageRank into the text score.
        """
        def sort_results(results):
            # Combine normalized Whoosh score and normalized PageRank.
            scores = {}
            max_score = 0
            max_rank = 0
            for res in results:
                scores[res.fields()['page_id']] = res.score
                if res.score > max_score:
                    max_score = res.score
            page_ids = map(lambda x: x.fields()['page_id'], results)
            pages = self.pages.find({"page_id": {"$in": page_ids}}, {"title": True, "page_id":True, "rank":True, "url": True})
            pages = map(lambda x: dict(x), pages)
            for page in pages:
                if 'rank' not in page:
                    page['rank'] = 0
                if page['rank'] > max_rank:
                    max_rank = page['rank']
            for page in pages:
                del page['_id']  # ObjectId is not JSON-serializable
                rank = 1 - page['rank'] / float(max_rank)
                score = scores[page['page_id']] / float(max_score)
                final_score = rank * (rank_part / 100.0) + score * (1 - rank_part / 100.0)
                page['score'] = final_score
            pages.sort(key = lambda x: x['score'])
            return pages
        with self.index.searcher() as searcher:
            parser = QueryParser("content", self.index.schema)
            parsed_query = parser.parse(query)
            results = searcher.search(parsed_query)
            if len(results) > 0:
                print("found some")
                print(len(results))
                results = sort_results(results)
            else:
                results = []
            # results = map(lambda x: dict(x), results)
            print(results)
            socket.write_message(json.dumps({ 'action': 'search results', 'results' : results }))

    def clear_index(self):
        """Drop the Whoosh index and mark every page as unindexed."""
        self._init_whoosh(clear = True)
        self.pages.update({'indexed': True}, {'$set': {'indexed': False}}, multi = True, upsert = False)
        self._init_index()
        self._send_to_all({ 'action': 'index clear' })

    def clear_frontier(self):
        """Delete all uncrawled pages and rebuild the crawl state."""
        self.pages.remove({'crawled': False})
        self._init_crawl()
        self._send_to_all({ 'action': 'init', 'frontier_size': 0 })

    def clear_all(self):
        """Full reset: drop the collection, recreate index and crawl state."""
        self.mongo.drop_collection('pages')
        self._init_whoosh(clear = True)
        self._init_index()
        self._init_crawl()
        self.indexing = False
        self.crawling = False
        self.index_size = 0
        self.crosssite_crawl = False
        self._send_to_all(json.dumps({ 'action': 'init', 'pages': [], 'frontier_size': 0, 'crawl_size': 0, 'index_size': 0, 'crawling': False, 'indexing': False, 'crosssite_crawl': False }))

    def _format_rank(self, rank):
        """Format a raw rank for display (log-scaled); None stays None."""
        if rank == None:
            return None
        return "%.2f" % (math.log(rank + 1) * 100)

    def _send_to_all(self, message):
        """Broadcast a message (dict or pre-encoded str) to all sockets."""
        if type(message) != str:
            message = json.dumps(message)
        for socket in self.sockets:
            socket.write_message(message)

    def _start_thread(self, target, count=1, args=(), kwargs={}):
        """Start `count` daemon threads running `target`."""
        for x in range(count):
            thread = threading.Thread(target=target, args=args, kwargs=kwargs)
            thread.setDaemon(True)
            thread.start()

    def _index_size(self):
        """Measure and cache the on-disk size of the Whoosh index dir."""
        self.index_size = sum(os.path.getsize('index/'+f) for f in os.listdir('index') if os.path.isfile('index/'+f))
        print("Index Size: %d" % self.index_size)
        return self.index_size

    def add_socket(self, socket):
        """Register a websocket and send it the full current state."""
        self.sockets.append(socket)
        pages = self.pages.find({'crawled': True}, {'_id': False, 'page_id':True, 'url': True, 'title': True, 'indexed': True, 'rank': True})
        pages = map(lambda x: {'page_id': x['page_id'], 'title': x['title'], 'url': x['url'], 'indexed': x['indexed'], 'rank': self._format_rank(x.get('rank'))}, pages)
        socket.write_message(json.dumps({ 'action': 'init', 'pages': pages, 'frontier_size': self.frontier.qsize(), 'crawl_size': self.crawlCount, 'index_size': self.index_size, 'crawling': self.crawling, 'indexing': self.indexing, 'crosssite_crawl': self.crosssite_crawl }))

    def remove_socket(self, socket):
        """Forget a disconnected websocket."""
        self.sockets.remove(socket)

    def start_crawl(self, url=''):
        """Seed the frontier with `url` (default seed page) and start crawling."""
        if url == '':
            url = 'http://en.wikipedia.org/wiki/Information_retrieval'
        with self.crawl_lock:
            page_id = len(self.pageset)
            self.pages.insert({ 'page_id': page_id, 'url': url, 'crawled': False, 'indexed': False })
            self.frontier.put(len(self.pageset))
            self.pageset[url] = page_id
        self.toggle_crawl(state = True)

    def toggle_crawl(self, state=None):
        """Enable/disable crawling (toggle when state is None) and wake crawlers."""
        with self.crawl_cond:
            if state == None:
                self.crawling = not self.crawling
            else:
                self.crawling = state
            self.crawl_cond.notifyAll()
        self._send_to_all({ 'action': 'init', 'crawling': self.crawling })

    def toggle_index(self, state=None):
        """Enable/disable indexing (toggle when state is None) and wake the indexer."""
        with self.index_cond:
            if state == None:
                self.indexing = not self.indexing
            else:
                self.indexing = state
            self.index_cond.notifyAll()
        self._send_to_all({ 'action': 'init', 'indexing': self.indexing })

    def index_page(self, page):
        """Queue a single page for priority indexing; if indexing was off,
        turn it on just long enough to process this page."""
        self.index_altq.put(page)
        with self.index_cond:
            self.index_alt_switchoff = not self.indexing
            self.indexing = True
            self.index_cond.notifyAll()

    def toggle_crosssite_crawl(self, state=None):
        """Enable/disable following links to other hosts."""
        if state == None:
            self.crosssite_crawl = not self.crosssite_crawl
        else:
            self.crosssite_crawl = state
        self._send_to_all({ 'action': 'init', 'crosssite_crawl': self.crosssite_crawl })
class StreamingCurve(TaurusCurve):
    '''TaurusCurve variant that decouples event reception from event
    processing. Incoming events are pushed onto a LIFO stack and a
    background thread processes only the newest one, dropping (and
    logging) any older backlog so the widget never falls behind a fast
    event source.
    '''

    def __init__(self, name, xname=None, parent=None, rawData=None,
                 optimized=False):
        super(StreamingCurve, self).__init__(name, xname, parent, rawData,
                                             optimized)
        self.setObjectName("StreamingCurve")
        self.buildStackThread()
        self.launchStackThread()

    def buildStackThread(self):
        '''
            Build the streaming machinery: a LIFO stack for the received
            events (newer events are processed first, older ones dropped),
            a lock for the critical section around the stack, a wait
            condition (Event) used to notify that new data was stacked,
            and a second Event used to tell the manager thread to finish.
        '''
        self._eventStack = LifoQueue()
        self._queueLock = RLock()
        self._queueManager = Thread(target=self.__streamingManager,
                                    name="StreamingManager")
        self._newDataAvailable = Event()
        self._endStreaming = Event()

    def launchStackThread(self):
        '''Reset the flags and start the background manager thread.'''
        self._endStreaming.clear()
        self._newDataAvailable.clear()
        self._queueManager.start()

    def __del__(self):
        # Ask the manager thread to finish.
        # BUGFIX: the original cleared _newDataAvailable here, which left
        # a manager thread parked in _newDataAvailable.wait() asleep
        # forever -- it could never re-check _endStreaming. Setting the
        # event wakes the thread so its loop condition is evaluated and
        # it exits cleanly.
        self._endStreaming.set()
        self._newDataAvailable.set()

    def setObjectName(self, name):
        '''Rename the logger used by this curve to the given object name.'''
        if name is not None and isinstance(name, str):
            self.log_name = name
            self.log_full_name = self.log_name
            self.log_obj = self._getLogger(self.log_full_name)

    def eventReceived(self, evt_src, evt_type, evt_value):
        '''
            Usually this call should need a short time to be finished
            before the next event is received. To keep it cheap, the event
            is simply pushed onto the stack and the processor thread is
            awakened; the processor then handles the latest arrival and
            discards the others (the stack exists so dropped events can at
            least be counted and logged). If the stack machinery was never
            built, fall back to the plain TaurusCurve behaviour.
        '''
        if hasattr(self, '_queueLock'):
            self.debug("%s receive an event" % (evt_src.name))
            with self._queueLock:
                self._eventStack.put([evt_src, evt_type, evt_value])
                self._newDataAvailable.set()
        else:
            self.warning("%s receive an event, but no locker, processing the "
                         "event without the streaming feature" % (evt_src.name))
            TaurusCurve.eventReceived(self, evt_src, evt_type, evt_value)

    def __streamingManager(self):
        '''
            Main method of the background thread: process stacked events
            (dropping stale ones) until the end flag is raised, sleeping
            on the new-data event between bursts.
        '''
        self.info("Streaming manager thread created")
        while not self._endStreaming.isSet():
            self.__processStreamingEvent()
            self.debug("Streaming manager %s go sleep!" % (self.modelName))
            self._newDataAvailable.wait()
            self.debug("Streaming manager %s wake up!" % (self.modelName))
        self.info("Queue process finish event...")

    def __processStreamingEvent(self):
        # Pop the newest event (LIFO order), drop the remaining backlog,
        # then hand the event to the normal TaurusCurve processing outside
        # the critical section.
        if not self._eventStack.empty():
            with self._queueLock:
                # as it's a LIFO queue, get() returns the last received
                evt_src, evt_type, evt_value = self._eventStack.get()
                self.__cleanQueue()
                self._newDataAvailable.clear()
            TaurusCurve.eventReceived(self, evt_src, evt_type, evt_value)

    def __cleanQueue(self):
        # Drain any stale events still stacked, reporting how many were
        # dropped. Caller must hold _queueLock.
        if not self._eventStack.empty():
            self.warning("Dropping %d event(s) on the %s queue"
                         % (self._eventStack.qsize(), self.modelName))
            while not self._eventStack.empty():
                self._eventStack.get()