class TaskManager:
    # noinspection PyPep8Naming
    def __init__(self, jobs_queue_capacity: int, workers_num: int, WorkerClass: Worker.__class__ = Worker):
        # empty job queue
        self._queue = JoinableQueue(maxsize=jobs_queue_capacity)
        logger.info(
            f'Queue size set to accept at most {jobs_queue_capacity} jobs before pausing job assignment.'
        )
        self.WorkerClass = WorkerClass
        self.workers_num = max_number_of_workers(workers_num)
        self._workers: List[Worker] = []

    def wake_up_workers(self):
        self._workers = [
            self.WorkerClass(self._queue) for _ in range(self.workers_num)
        ]
        for worker in self._workers:
            worker.start()

    def assign_task(self, job: Task):
        self._queue.put(job)

    def stop_workers(self):
        logger.info('waiting for all workers to finish')
        # The usual termination condition is to put None on the queue. Queues are FIFO, but from the Python 3.8 docs
        # https://docs.python.org/3.8/library/multiprocessing.html#pipes-and-queues :
        # "If multiple processes are enqueuing objects, it is possible for the objects to be received at the other
        # end out-of-order. However, objects enqueued by the same process will always be in the expected order
        # with respect to each other."
        # So with a single producer this is not an issue; with many producers it may happen that, even though the
        # Nones are enqueued last, consumers pick them up before other items in the queue (breaking the FIFO
        # assumption). In that case the workers would leave before the queue is empty. To avoid this, before
        # sending the Nones it's better to wait for the queue to be consumed.
        while not self._queue.empty():  # not bullet-proof, as empty() and qsize() return approximate values, but it helps
            print(f"jobs waiting to be assigned: {self._queue.qsize()}")
            sleep(1)
        for _ in self._workers:
            self._queue.put(None, block=True, timeout=None)
        self._queue.join()
        logger.info('all processes finished')

    def discard_waiting_tasks(self):
        while not self._queue.empty():
            try:
                self._queue.get(False)
            except Empty:
                continue
            self._queue.task_done()

    def number_of_waiting_tasks(self):
        return self._queue.qsize()
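# A minimal, self-contained sketch of the sentinel-shutdown pattern described in
# stop_workers() above (it is not the project's Worker class): one producer,
# several consumer processes, and one None "poison pill" per consumer enqueued
# after the real jobs. Every get() is paired with a task_done() so queue.join()
# can return.
from multiprocessing import JoinableQueue, Process


def _sketch_worker(queue: JoinableQueue) -> None:
    while True:
        job = queue.get()
        try:
            if job is None:  # sentinel: stop consuming
                return
            print(f"processing {job}")
        finally:
            queue.task_done()  # always acknowledge, even for the sentinel


if __name__ == "__main__":
    queue = JoinableQueue(maxsize=10)
    workers = [Process(target=_sketch_worker, args=(queue,)) for _ in range(3)]
    for p in workers:
        p.start()
    for job in range(20):  # single producer, so FIFO order is preserved
        queue.put(job)
    for _ in workers:  # one sentinel per worker, enqueued last
        queue.put(None)
    queue.join()  # blocks until every item (including the sentinels) was acknowledged
    for p in workers:
        p.join()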
class ImageCrawler: NUM_PER_FETCH = 100 NUM_PROCESSES = 10 def __init__(self, database_config_path): self.queue = JoinableQueue() self.logger = Logger("image_crawler") self.adapter = ImageStoreAdapter(database_config_path, self.logger) def produce(self): while True: if self.queue.empty(): for image_id, link in self.adapter.load_undownloaded_images( self.NUM_PER_FETCH): self.logger.log("Producer: add new image to crawl:" + image_id + " " + link) self.queue.put((image_id, link)) time.sleep(10) def consume(self, process_id): while True: self.logger.log("Consumer process:" + str(process_id) + " fetch new image from queue") if not self.queue.empty(): image_id, link = self.queue.get() self.logger.log("Consumer process:" + str(process_id) + " start crawling " + str(link)) image = common_utils.page_crawl(link) if image != None: self.logger.log(link + "crawled successfully") self.adapter.store_image(image_id, image) else: self.logger.log(link + " failed at crawling") self.adapter.update_image_status( image_id, ImageIndexStatus.DOWNLOAD_FAILED) self.queue.task_done() time.sleep(1) else: self.logger.log("Queue empty") time.sleep(10) def run(self): producer = Process(target=self.produce) producer.start() consumers = [] for i in range(self.NUM_PROCESSES): consumer = Process(target=self.consume, args=(i, )) consumers.append(consumer) consumer.start() for consumer in consumers: consumer.join() producer.join() self.queue.join()
def test_hyperband_executor_basic(self): # Create executor inputs_queue = JoinableQueue() results_queue = JoinableQueue() executor = executors.HyperbandExecutor.HyperbandExecutor( inputs_queue, results_queue, hyperband_epochs_budget=100) executor.get_data_loading_pipelines = get_data_loading_pipelines_override # Load sample data data_uri = utils.utils.get_git_root( os.path.dirname( os.path.abspath(__file__))) + "/test_data/185_baseball" assert (os.path.exists(data_uri)) problem_doc, dataset = utils.utils.load_data_from_dir(data_uri, mode="train") # Process item inputs_queue.put((problem_doc, dataset)) executor.start() inputs_queue.join() # Gather results results = [] while not results_queue.empty(): print("Gathering...") results.append( results_queue.get(True, executors.Executor.QUEUE_TIMEOUT)) executor.terminate()
def calculate_set(num_processes):
    todo_queue = JoinableQueue()
    results_queue = JoinableQueue()

    # setup and launch workers
    # we'll make them daemon processes so they shut down automatically when this process exits, but
    # we'll also shut them down ourselves when we finish
    workers = [
        Process(target=worker, args=(todo_queue, results_queue))
        for i in xrange(num_processes)
    ]
    for individual in workers:
        individual.daemon = True
        individual.start()

    result = numpy.zeros([ny, nx])
    for i in xrange(ny):
        y = i * dy + ylo
        for j in xrange(nx):
            x = j * dx + xlo
            todo_queue.put((x, y, i, j))

    todo_queue.join()

    while not results_queue.empty():
        i, j, val = results_queue.get()
        result[i, j] = val
        results_queue.task_done()

    # shutdown the compute processes
    for individual in workers:
        individual.terminate()

    return result
def main(): jobs = JoinableQueue() result = JoinableQueue() numToProcess = -1 scores = pd.DataFrame(columns=['query','fmeasure','precision','recall', 'size','maxDistance','topHits',"contextSteps"]) print len(datasets) for key in datasets: jobs.put(key) processed_count = Counter() for i in xrange(NUMBER_OF_PROCESSES): p = Process(target=work, args=(i, jobs, result, processed_count)) p.daemon = True p.start() #work(1, jobs, result, processed_count) automated_annotations = {} distances = {} jobs.join() dataset_index = collections.defaultdict(set) annotated_datasets = set() while not result.empty(): dataset, classes = result.get() if len(classes) == 0: annotated_datasets.add(dataset) for c in classes.keys(): dataset_index[c].add(dataset) owl_class = Class(c, graph=graph) for parent in owl_class.parents: dataset_index[parent.identifier].add(dataset) result.task_done() print '\n' for query, c in queries.items(): manual = ground_truth[query] automated = dataset_index[c] hits = manual & automated misses = manual - automated precision = np.nan if len(automated) == 0 else float(len(hits)) / len(automated) recall = np.nan if len(manual) == 0 else float(len(hits)) / len(manual) if precision != 0 or recall != 0: fmeasure = 0 if np.isnan(precision) or np.isnan(recall) else 2 * (precision * recall) / (precision + recall) else: fmeasure = 0 scores = scores.append(dict(query=query, size=len(manual), precision=precision, recall=recall, fmeasure=fmeasure,topHits=topHits, maxDistance=maxDistance, contextSteps = context_steps), ignore_index=True) print "Hits for", query, c print '\n'.join(sorted(hits)) print scores print "Annotated", len(annotated_datasets), "datasets."
def progdev_all(boffile, gain):
    """ Initialize all roach boards with boffile and gain settings """
    roachlist = ['rofl%i' % i for i in range(1, 16 + 1)]
    n_roach = len(roachlist)

    print "Programming all roaches with %s" % boffile
    print "Gain value: %ix" % gain
    print "Please wait..."

    # Create worker processes and a shared message queue
    procs = []
    q = JoinableQueue()
    for i in range(n_roach):
        p = Process(target=progdev_adc16, args=(roachlist[i], q, boffile, gain))
        procs.append(p)

    # Start processes
    for p in procs:
        p.start()

    # Join processes
    for p in procs:
        p.join()

    # Print messages
    while q.empty() is False:
        print q.get()
    print "OK"
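# Hedged sketch of the worker contract progdev_all() above relies on: each
# progdev_adc16 process pushes a single status string onto the shared queue, and
# the parent simply drains the queue after join()ing the workers (no task_done()
# needed because the queue itself is never join()ed). The body below is only a
# placeholder for illustration; the real progdev_adc16 programs the boffile and
# gain on the named roach board.
def progdev_adc16(roach, q, boffile, gain):
    # ... program the board here ...
    q.put("%s: programmed %s with gain %ix" % (roach, boffile, gain))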
def waiter(queue: multiprocessing.JoinableQueue) -> tuple:
    longest_word = None
    while True:
        results = queue.get()
        if results[2] is True:
            longest_word = results
        queue.task_done()
        if queue.empty():
            print("Final queue is empty, waiting 1 second to be sure it's REALLY empty.")
            time.sleep(1)
            if queue.empty():
                break
    return longest_word
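# A hedged sketch of a producer that pairs with waiter() above, assuming results
# are (word, length, is_longest) tuples. The names and data are illustrative only;
# when waiter() runs as a Process its return value is lost, so this sketch only
# exercises the queue interaction. Enqueuing before start() avoids the corner case
# where waiter() sees a momentarily empty queue and exits early.
import multiprocessing

if __name__ == "__main__":
    q = multiprocessing.JoinableQueue()
    for word in ["cat", "giraffe", "ox"]:
        q.put((word, len(word), word == "giraffe"))
    w = multiprocessing.Process(target=waiter, args=(q,))
    w.start()
    q.join()  # returns once every result has been task_done()'d
    w.join()  # waiter exits after its double empty() check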
class ScheduleContainer(object): def __init__(self): self.schedule_container = JoinableQueue(maxsize=0) self.scheduler = schedule self._run = True worker = Thread(target=self.work) worker.start() def append(self, request_form): self.schedule_container.put(request_form) @staticmethod def task(temp): def inner(): t.change_temp(temp) return inner def work(self): lock = Lock() while self._run: lock.acquire() if not self.schedule_container.empty(): schedule_obj = self.schedule_container.get() job = self.scheduler.every() job.start_day = str(schedule_obj.day) job.unit = 'weeks' job.at(str(schedule_obj.time)).do(self.task(schedule_obj.temp)) print('schedule made into job') schedule_obj.save() self.schedule_container.task_done() lock.release() schedule.run_pending() time.sleep(1)
def _parallel(self, reps, keys, treatments, num_threads): jobs = JoinableQueue() results = JoinableQueue() for t in treatments: kwargs = dict(zip(keys, t)) if keys else {} for r in range(reps): jobs.put((str(t), kwargs)) def op(jobs, results): while True: name, kwargs = jobs.get() res = self.sim.run(self.builder(**kwargs)) results.put((name, res)) jobs.task_done() for th in range(num_threads): process = Process(target=op, name=str(th), args=[jobs, results]) process.start() jobs.join() formatted_results = {} while not results.empty(): n, r = results.get() if n in formatted_results: formatted_results[n].append(r) else: formatted_results[n] = [r] if len(formatted_results) == 1: return formatted_results[list(formatted_results.keys())[0]] return formatted_results
def _drain_and_join_queue(q: mp.JoinableQueue, join: bool = True) -> None: """ Drains a queue completely, such that it is joinable :param q: Queue to join :param join: Whether to join the queue or not """ # Do nothing when it's not set if q is None: return # Call task done up to the point where we get a ValueError. We need to do this when child processes already # started processing on some tasks and got terminated half-way. n = 0 try: while True: q.task_done() n += 1 except ValueError: pass try: while not q.empty() or n != 0: q.get(block=True, timeout=1.0) n -= 1 except (queue.Empty, EOFError): pass # Join if join: q.join()
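# Hypothetical usage sketch for _drain_and_join_queue() above: simulate a worker
# that was terminated after get() but before task_done(), which would make a plain
# q.join() hang. The helper balances the task_done() bookkeeping, empties the
# queue, and only then joins it.
import multiprocessing as mp

if __name__ == "__main__":
    q = mp.JoinableQueue()
    for i in range(5):
        q.put(i)
    q.get()  # a "killed" worker took an item but never acknowledged it
    _drain_and_join_queue(q)  # drains the leftovers and joins without hanging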
class BLEScanner: def __init__(self, devices_to_check_for, scan_delegate_class): self.manager = Manager() self.devices_to_check_for = self.manager.dict() for key in devices_to_check_for: self.devices_to_check_for[key] = devices_to_check_for[key] self.to_connect_queue = JoinableQueue() self.scanner = Scanner().withDelegate(scan_delegate_class(self)) self.stopped = False def start(self): self.stop_event = Event() self.stop_event.clear() self.process_connection_event = Event() self.process_connection_event.clear() # Need to use multiprocessing to start this in a new process # This is because the Scanner code in bluepy-helper sets the state of currently connected devices to "disconnected" # Thus, if the connected device code is in another thread waiting for notifications, this will # cause it to raise a BTLEDisconnectedError even though the device is actually still connected. # The workaround is to start the scanning in a new _process_ instead. This will create a whole new copy # of bluepy-helper, which allows the scanner to do whatever it want to the device state in its copy of # bluepy-helper and not have to worry about screwing up the device state in the connected BLE object self.process = Process(target=self.scan, args=()) self.process.start() print("Started Scan") return self def scan(self): while True: if self.stop_event.is_set(): return # Delay if still connecting (There are devices on the queue that are not done yet) self.to_connect_queue.join() # Done waiting: clear the process_connection_event bit self.process_connection_event.clear() self.devices = self.scanner.scan(5, passive=True) # If there are devices that we need to connect to, signal that we want to connect to them to the main process. # The following line will if not self.to_connect_queue.empty(): self.process_connection_event.set() def stop(self): # self.stopped = True self.stop_event.set()
def multi_write_selected_pfam_genes(options, useful_pfam, annot_genes_all): ''' Run "write_selected_pfam_genes" on multiple threads. ''' global q q = JoinableQueue() for fam in useful_pfam: q.put(fam) for i in range(options.threads): p = Process(target = write_selected_pfam_genes, name = '%i' % (i+1), args = (options, annot_genes_all)) p.start() sleep(options.threads*0.05) q.join() sleep(options.threads*0.05) if p.is_alive() and q.empty(): sleep(options.threads*0.2) if p.is_alive() and q.empty(): p.terminate()
def main(): jobs = JoinableQueue() result = JoinableQueue() print len(datasets) numToProcess = 10 scores = pd.DataFrame(columns=[ 'precision', 'recall', 'fmeasure', 'numResult', 'minScore', 'topHits', 'contentWeight', 'relationWeight' ]) manual_annotations = get_manual_annotations(numToProcess) manual_tuples = get_ir_tuples(manual_annotations) for key in manual_annotations.keys(): jobs.put(key) processed_count = Counter() for i in xrange(NUMBER_OF_PROCESSES): p = Process(target=work, args=(i, jobs, result, processed_count)) p.daemon = True p.start() #work(1, jobs, result, processed_count) automated_annotations = {} jobs.join() while not result.empty(): dataset, classes = result.get() automated_annotations[dataset] = classes result.task_done() automated_tuples = get_ir_tuples(automated_annotations) hits = manual_tuples & automated_tuples misses = manual_tuples - automated_tuples precision = float(len(hits)) / len(automated_tuples) recall = float(len(hits)) / len(manual_tuples) fmeasure = 2 * (precision * recall) / (precision + recall) # print '\t'.join([str(x) for x in [precision, recall, fmeasure, # numResult, minScore, topHits]]) scores = scores.append(dict(precision=precision, recall=recall, fmeasure=fmeasure, numResult=numResult, minScore=minScore, topHits=topHits, contentWeight=contentWeight, relationWeight=relationWeight), ignore_index=True) print scores
class findFile(object):
    def __init__(self):
        self.input_Q = JoinableQueue()
        self.out_Q = JoinableQueue()

    def run(self):
        # Pass the bound methods themselves as targets; calling them here
        # (e.g. target=self.funcA()) would run them in the parent process instead.
        p1 = Process(target=self.funcA)
        p2 = Process(target=self.funcB)
        p3 = Process(target=self.funcB)
        p1.start()
        p2.start()
        p3.start()
        p1.join()
        p2.join()
        p3.join()
        self.input_Q.join()
        l = []
        while self.out_Q.empty() is False:
            l.append(self.out_Q.get())
        print('l = ', l)
        print("p1.is_alive() = ", p1.is_alive())
        print("p2.is_alive() = ", p2.is_alive())
        print("p3.is_alive() = ", p3.is_alive())

    def funcA(self):
        while True:
            find_str = input('Enter the string to search for: ')
            if 'exit' == find_str.lower():
                break
            self.input_Q.put(find_str)

    def funcB(self):
        while True:
            if self.input_Q.empty():
                break
            find_str = self.input_Q.get()
            self.out_Q.put(find_str)
            print('find_str = ', find_str)
            self.input_Q.task_done()
def test_basic():
    in_queue = JoinableQueue()
    mysql_reader = Mysqlio('localhost', '3600', 'test', 'root', '')
    mysql_reader.scan_and_queue(in_queue, "SELECT * FROM swallow")

    assert in_queue.qsize() == 3

    res = []
    while not in_queue.empty():
        res.append(in_queue.get())

    expected_res = [{'id': 1, 'libelle': 'test'},
                    {'id': 2, 'libelle': 'john'},
                    {'id': 3, 'libelle': 'woo'}]
    assert res == expected_res
def _make_concept_pair(self): q = JoinableQueue() done_q = Queue() threads = [] for i in range(THREAD): t = Process(target=_evaluate_pair_word, args=(self.model, self.w_ij, q, done_q, self.sim, self.threshold)) t.start() threads.append(t) t = Process(target=_print_progress, args=(q, )) t.start() threads.append(t) for cluster in self._clusters: lc_0 = set() lc_1 = set() for sen in cluster: tup = self.d_sentence(id(sen)) if tup[0] == 0: lc_0.update(self.s_ik[tup[0]][tup[1]]) else: lc_1.update(self.s_ik[tup[0]][tup[1]]) for c_0 in lc_0: for c_1 in lc_1: tup = ((self.d_concept[c_0][1], c_0), (self.d_concept[c_1][1], c_1)) q.put(tup) logger.info('Queuing complete') while not q.empty() or not done_q.empty(): try: tup = done_q.get() j = tup[0][0] k = tup[0][1] self.u_jk[(j, k)] = tup[1] except queue.Empty: pass # block until all tasks are done q.join() # stop workers for t in threads: t.terminate() print('Nb pair : ' + str(len(self.u_jk)))
def process_modules_worker(cls, queue: multiprocessing.JoinableQueue) -> None:
    while True:
        # A blocking get() avoids busy-spinning on queue.empty() and the race where
        # the queue is drained between the empty() check and get_nowait().
        q = queue.get()
        print(f"{os.getpid()}")
        irc: IrcClient = q[0]
        message: str = q[1]
        if message:
            irc.process_private_message(irc, message)
        # task_done() must be paired with every successful get(); skipping it for
        # empty messages would make queue.join() hang forever.
        queue.task_done()
class RealTimePlotSimulation: def __init__(self, SimulationClass, remoteplotter): self.remoteplotter = remoteplotter self.SimulationClass = SimulationClass self.plotter_conn, worker_conn = Pipe() self.queue = JoinableQueue(maxsize=1) self.axesextent = remoteplotter.axesExtent() self.plotter_conn.send(('axes',self.axesextent)) phantomprocess = Process(target=self.work, args=(self.queue, worker_conn, SimulationClass, )) phantomprocess.start() anim = animation.FuncAnimation(remoteplotter.fig, self.update_plot, interval=100) show() # The phantom process is properly stopped print 'Killing the job...' self.plotter_conn.send(('gokillyourself',)) phantomprocess.join() print 'Done.' def update_plot(self, num): extent = self.remoteplotter.axesExtent() if extent != self.axesextent: self.plotter_conn.send(('axes',extent)) self.axesextent = extent if self.queue.empty(): return data = self.queue.get() self.remoteplotter.plotData(data) self.queue.task_done() def work(self, queue, worker_conn, SimulationClass): s = SimulationClass() # Main loop for time in s.simulation: s.inmainloop() # Check for orders if worker_conn.poll(): message = worker_conn.recv() if message[0] == 'gokillyourself': print 'Suicide order received. Exiting main loop.' break if message[0] == 'axes': extent = message[1] if queue.empty(): # Retreive data queue.put(self.remoteplotter.pickableData(s.simulation, extent)) queue.cancel_join_thread()
def test_sklearn_stacked_lstm_executor(self): git_root_path = utils.utils.get_git_root( os.path.dirname(os.path.abspath(__file__))) # Create executor inputs_queue = JoinableQueue() results_queue = JoinableQueue() lstm_path = git_root_path + "/Stanford-D3M-Full/experimental/lstm_predictor/saved_models/20190130/" assert (os.path.exists(lstm_path)) executor = executors.SklearnStackedLSTMExecutor.SklearnStackedLSTMExecutor( inputs_queue, results_queue, override_sklearn_primitives_set=[ "d3m.primitives.regression.linear_svr.SKlearn", "d3m.primitives.regression.gaussian_process.SKlearn" ], lstm_path=lstm_path) executor.get_data_loading_pipelines = get_data_loading_pipelines_override # Load sample data data_uri = git_root_path + "/test_data/185_baseball" assert (os.path.exists(data_uri)) problem_doc, dataset = utils.utils.load_data_from_dir(data_uri, mode="train") # Process item inputs_queue.put((problem_doc, dataset)) inputs_queue.put((problem_doc, dataset)) inputs_queue.put((problem_doc, dataset)) executor.start() inputs_queue.join() # Gather results results = [] while not results_queue.empty(): print("Gathering...") results.append( results_queue.get(True, executors.Executor.QUEUE_TIMEOUT)) executor.terminate() for result in results: assert None not in result
def test(tasks_queue: mp.JoinableQueue, result_queue: mp.Queue):
    while not tasks_queue.empty():
        result = list()
        task = tasks_queue.get()
        # modified code from readme.md
        apk_info = APKInfo(task)
        for field in sorted([getattr(apk_info, m) for m in dir(apk_info) if not m.startswith("_")],
                            key=lambda x: callable(x)):
            result.append(
                f"{field.__name__:25}: {field()}" if callable(field)
                else str(field) if not isinstance(field, APKOpener)
                else os.path.basename(task))  # replace the first field with the package name
        result_queue.put(result)
        tasks_queue.task_done()
def z_from_u_worker(q: mp.JoinableQueue, function, grid, u_values, z_values):
    """
    z_from_u unit function in case of multiprocessing
    :param q:
    :param function:
    :param grid:
    :param u_values:
    :param z_values:
    :return:
    """
    while not q.empty():
        i = q.get()
        a_loc = grid.loc[grid['u_values'] <= u_values[i]]
        a_loc = a_loc.iloc[len(a_loc) - 1:len(a_loc)].index[0]
        b_loc = grid.loc[grid['u_values'] >= u_values[i]].index[0]
        z_values[i] = brentq(f=lambda x: function(x, u_values[i]),
                             a=grid.iloc[a_loc, 0],
                             b=grid.iloc[b_loc, 0])
        q.task_done()
class SimpleTaskWorkerManager(object): # os.cpu_count() also for default? """ this will use a queue to keep track of all the total tasks and assign a fixed amount of workers to finish all of the tasks. """ def __init__(self, tasks=None, max_workers=8): self.workers = [] if tasks is None: self.tasks = JoinableQueue() else: self.tasks = tasks self.total_tasks = 0 self.num_workers = max_workers def __nonzero__(self): if self.tasks.empty(): return True return False def add_task_to_queue(self, task): try: self.tasks.put(task) self.total_tasks += 1 except Exception as e: print 'unable to add worker to queue. error code: %s' % str(e) def start_working(self): if self.num_workers > self.total_tasks: self.num_workers = self.total_tasks print 'assigning %d workers to help you.' % self.num_workers for _ in range(self.num_workers): p = SimpleTaskWorker(args=(self.tasks, )) p.start() self.workers.append(p) self.tasks.join() def wait_all(self): for worker in self.workers: self.tasks.put(None) for worker in self.workers: worker.join()
def test_simple_random_sklearn_executor(self): # Create executor inputs_queue = JoinableQueue() results_queue = JoinableQueue() executor = executors.SimpleRandomSklearnExecutor.SimpleRandomSklearnExecutor( inputs_queue, results_queue, override_sklearn_primitives_set=[ "d3m.primitives.regression.linear_svr.SKlearn" ]) executor.get_data_loading_pipelines = get_data_loading_pipelines_override # Load sample data data_uri = utils.utils.get_git_root( os.path.dirname( os.path.abspath(__file__))) + "/test_data/185_baseball" assert (os.path.exists(data_uri)) problem_doc, dataset = utils.utils.load_data_from_dir(data_uri, mode="train") # Process item inputs_queue.put((problem_doc, dataset)) inputs_queue.put((problem_doc, dataset)) inputs_queue.put((problem_doc, dataset)) executor.start() inputs_queue.join() # Gather results results = [] while not results_queue.empty(): print("Gathering...") results.append( results_queue.get(True, executors.Executor.QUEUE_TIMEOUT)) executor.terminate() assert (len(results) == 3) for result in results: assert None not in result
class FileReader(Process):
    def __init__(self, filename, buffer_size=1000):
        super(FileReader, self).__init__()
        self.filename = filename
        self.que = JoinableQueue(buffer_size)
        self.event = Event()
        self.event.set()
        self.started = Event()
        self.started.clear()

    # It's crucial to call task_done on the queue after the item was processed
    def get_queue(self):
        return self.que

    def get_event(self):
        return self.event

    def is_done(self):
        return not self.event.is_set() and self.que.empty()

    def run(self):
        self.started.set()
        self.proc()
        self.event.clear()

    def proc(self):
        with open_gz(self.filename, encoding='utf-8') as file:
            for line in file:
                self.que.put(line)

    def __iter__(self):
        self.start()
        self.started.wait()
        while not self.is_done():
            try:
                text = self.que.get(timeout=0.1)
                yield text
                self.que.task_done()
            except Empty:
                pass
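# Hypothetical usage of the FileReader above: iterating an instance starts the
# reader process and streams lines through the bounded JoinableQueue, so a slow
# consumer applies back-pressure on the reader. "data.txt.gz" is an assumed path;
# open_gz is whatever gzip/plain-text opener the original module provides.
if __name__ == "__main__":
    for line in FileReader("data.txt.gz", buffer_size=500):
        print(line.rstrip())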
def boss(ebs, worker, iterable): """ Boss Process :type ebs: EBSSnapshot :type worker: Callable :param iterable: :return: """ logger = getLogger('ebssnapshot.boss') jobqueue = JoinableQueue(ebs.workers) procs = [] for i in range(1, ebs.workers + 1): proc = Process(target=worker, args=[ i, jobqueue, ebs.region, ebs.description, ebs.uuid, ebs.role, ebs.session() ]) proc.daemon = True proc.start() procs.append(proc) signal.signal(signal.SIGINT, terminate) signal.signal(signal.SIGTERM, terminate) for job in iterable: while True: running = any(p.is_alive() for p in procs) if not running: logger.fatal('No children are alive: Exiting') sys.exit(-1) if jobqueue.empty(): jobqueue.put(job, block=True, timeout=60) break jobqueue.join()
def main(): global L2C_BEGIN, L2C_END, L2C_DELTA, L2G_BEGIN, L2G_END, L2G_DELTA global NFOLDS, ADD_ARGS, SVM_TRAIN, TRAIN_DATA, N_PER_SSH parser = OptionParser( usage="usage: %prog [options] <dataset> <gridscore-file>") parser.add_option("--log2c", dest="log2c", metavar="BEGIN END STEP", type='float', nargs=3, default=(L2C_BEGIN, L2C_END, L2C_DELTA), help="log2 of C SVM contraint [default: %default]") parser.add_option("--log2g", dest="log2g", metavar="BEGIN END STEP", type='float', nargs=3, default=(L2G_BEGIN, L2G_END, L2G_DELTA), help="log2 of G SVM contraint [default: %default]") parser.add_option("-v", "--fold", dest="fold", metavar="FOLD", type='int', default=NFOLDS, help="number of cross validation folds [default: %default]") parser.add_option("-a", "--args", dest="args", metavar="ARGS", type='string', default=ADD_ARGS, help="additional arguments to the SVM trainer [default: %default]") parser.add_option("--svm-train", dest="svm_train", metavar="PATHNAME", type='string', default=SVM_TRAIN, help="path of SVM trainer [default: %default]") (options, args) = parser.parse_args() if len(args) != 2: parser.print_usage(file=sys.stderr) return 1 L2C_BEGIN, L2C_END, L2C_DELTA = options.log2c L2G_BEGIN, L2G_END, L2G_DELTA = options.log2g NFOLDS = options.fold ADD_ARGS = options.args SVM_TRAIN = options.svm_train TRAIN_DATA, outfile = args job_queue = Queue() result_queue = Queue() for log2c, log2g in product( frange(L2C_BEGIN, L2C_END, L2C_DELTA), frange(L2G_BEGIN, L2G_END, L2G_DELTA)): job_queue.put((log2c, log2g)) for i in range(LOCAL_WORKERS): LocalWorker('local-%d' % i, job_queue, result_queue).start() for i, host in enumerate(SSH_WORKERS): for j in range(N_PER_SSH): SSHWorker('ssh-%d/%d' % (i, j), host, job_queue, result_queue).start() #block until all jobs are done job_queue.join() result = [] while not result_queue.empty(): result.append(result_queue.get()) result = sorted(result, key=op.itemgetter(3,1,2), reverse=True) _, best_log2c, best_log2g, best_score = max(result, key=op.itemgetter(3,1,2)) with open(outfile, 'w') as ofp: ofp.write("#best result: log2c=%f, log2g=%f, score=%f\n" % \ (best_log2c, best_log2g, best_score)) ofp.write("#log2(c)\tlog2(g)\tscore\n") for (name, log2c, log2g, score) in result: ofp.write("%f\t%f\t%f\n" % (log2c, log2g, score)) return 0
class _PrPipe(object): """Custom pipe manager to capture the output of processes and store them in dedicated thread-safe queues. Clients register their own queues. """ def __init__(self, pipeHandle): """ Args: pipeHandle (pipe): Pipe to monitor for records """ self.id = ''.join( [random.choice('0123456789ABCDEF') for x in range(6)]) self.queue = JoinableQueue(MAX_QUEUE_LENGTH) self.process = Process(target=self.enqueue_output, kwargs={ "out": pipeHandle, "queue": self.queue }) self.process.daemon = True self.process.start() self.clientQueuesLock = Lock() self.clientQueues = dict() self.lastClientId = 0 # Class contains Locks and Queues which cannot be pickled def __getstate__(self): """Prevent _PrPipe from being pickled across Processes Raises: Exception """ raise Exception("Don't pickle me!") def enqueue_output(self, out, queue): """Copy lines from a given pipe handle into a local threading.Queue Runs in a separate process, started by __init__ Args: out (pipe): Pipe to read from queue (Queue): Queue to write to """ for line in iter(out.readline, b''): queue.put(line.decode('utf-8')) out.close() def publish(self): """Push messages from the main queue to all client queues Must be triggered by an external mechanism Typically triggered by getLine or wait """ try: while not self.queue.empty(): with self.clientQueuesLock: line = self.queue.get_nowait() for q in list(self.clientQueues.values()): q.put(line) self.queue.task_done() except Empty: pass def getQueue(self, clientId): """Retrieve a client's Queue proxy object Args: clientId (string): ID of the client Returns: QueueProxy """ return self.clientQueues[text(clientId)] def isEmpty(self, clientId=None): """Checks whether the primary Queue or any clients' Queues are empty Returns True ONLY if ALL queues are empty if clientId is None Returns True ONLY if both main queue and specfied client queue are empty when clientId is provided Args: clientId (string): ID of the client Returns: bool """ if clientId is not None: return self.queue.empty() \ and self.getQueue(clientId).empty() else: empty = self.queue.empty() with self.clientQueuesLock: for q in list(self.clientQueues.values()): empty = empty and q.empty() return empty def is_alive(self): """Check whether the thread managing the pipe > Queue movement is still active Returns: bool """ return self.process.is_alive() def getLine(self, clientId): """Retrieve a line from a given client's Queue Args: clientId (string): ID of the client Returns: <element from Queue> Raises: Empty """ # Pull any newer lines self.publish() # Throws Empty q = self.getQueue(clientId) line = q.get_nowait() q.task_done() return line def registerClientQueue(self, queueProxy): """Attach an additional Queue proxy to this _PrPipe All elements published() from now on will also be added to this Queue Returns the clientId for the new client, which must be used in all future interaction with this _PrPipe Args: queueProxy (QueueProxy): Proxy object to a Queue we should populate Returns: string. The client's ID for acccess to this queue """ # Make sure we don't re-use a clientId clientId = self.lastClientId + 1 self.lastClientId = clientId with self.clientQueuesLock: self.clientQueues[text(clientId)] = queueProxy return text(clientId) def unRegisterClientQueue(self, clientId): """Detach a Queue proxy from this _PrPipe Returns the clientId that was removed Args: clientId (string): ID of the client Returns: string. 
ID of the client queue """ with self.clientQueuesLock: self.clientQueues.pop(clientId) return text(clientId) def destructiveAudit(self): """Print a line from each client Queue attached to this _PrPipe This is a destructive operation, as it *removes* a line from each Queue """ with self.clientQueuesLock: for clientId in list(self.clientQueues): try: print("clientId " + text(clientId) + ": " + self.getLine(clientId)) except: print("clientId " + text(clientId) + " is empty")
return [*file_hashes] if __name__ == '__main__': freeze_support() print(perf_counter()) with EXECUTOR(max_workers=MAX_WORKERS) as executor: in_dirs = 0 out_dirs = 1 in_list = list() out_list = [ basepath, ] hash_count = 0 sleeps = 0 while not q.empty() and q.qsize(): item = q.get() print(f'{in_dirs}: {item}') in_list.append(item) in_dirs += 1 future_dirs = executor.submit(dir_worker, item) dirs = future_dirs.result() q_dirs = [*map(q.put, dirs)] #print(dirs) out_dirs += len(dirs) out_list.extend(dirs) future_hashes = executor.submit(hash_worker, item) q_hashes = [*map(hash_q.put, dirs)] hashes = future_hashes.result() print(hashes) hash_count += len(hashes)
def main(): jobs = JoinableQueue() result = JoinableQueue() numToProcess = -1 scores = pd.DataFrame(columns=['fmeasure','precision','recall', 'numResult','maxDistance','topHits', 'contentWeight','relationWeight', 'hits', "contextSteps"]) manual_annotations = get_manual_annotations(numToProcess) manual_tuples = get_ir_tuples(manual_annotations) print len(manual_annotations) for i in range(weighted_kmeans_clustering_passes): print "Training pass",i+1 train_kmeans(manual_annotations.keys(), target_class_subtree) print "Complete." print "Training LSA..." lsa_model = train_lsa(manual_annotations) global useLSA useLSA = True global idf, lsa_model, target_classes, targets, target_class_subtree target_classes, idf, lsa_model = vectorize_ontology(graph, idf, lsa_model) subtree = set(graph.transitive_subjects(RDFS.subClassOf, oboe.MeasurementType)) target_class_subtree = [x for x in target_classes if x.identifier in subtree and x.identifier != oboe.MeasurementType] targets = dict([(x.identifier, x) for x in target_class_subtree]) print "Done." for key in manual_annotations.keys(): jobs.put(key) processed_count = Counter() #for i in xrange(NUMBER_OF_PROCESSES): # p = Process(target=work, args=(i, jobs, result, processed_count)) # p.daemon = True # p.start() work(1, jobs, result, processed_count) automated_annotations = {} distances = {} jobs.join() while not result.empty(): dataset, classes = result.get() automated_annotations[dataset] = set(classes.keys()) distances[dataset] = classes result.task_done() automated_tuples = get_ir_tuples(automated_annotations) hits = manual_tuples & automated_tuples misses = manual_tuples - automated_tuples precision = float(len(hits)) / len(automated_tuples) recall = float(len(hits)) / len(manual_tuples) fmeasure = 2 * (precision * recall) / (precision + recall) # print '\t'.join([str(x) for x in [precision, recall, fmeasure, # numResult, minScore, topHits]]) scores = scores.append(dict(precision=precision, recall=recall, fmeasure=fmeasure, hits=len(manual_tuples),topHits=topHits, maxDistance=maxDistance, contextSteps = context_steps), ignore_index=True) print '\n' print scores results_file = 'results.csv' if len(sys.argv) > 1: results_file = sys.argv[1] hit_curves = csv.writer(open(results_file,'wb'),delimiter=",") hit_curves.writerow(['dataset','class','distance','hit']) for dataset, c in automated_tuples: distance = round(distances[dataset][c],3) hit = 1 if (dataset,c) in manual_tuples else 0 hit_curves.writerow([dataset,c,distance,hit])
class FindText(BaseWorkerCustomer): NUM_WORKING_PROCESSES = 2 def __init__(self, params, *args, **kwargs): super(FindText, self).__init__(*args, **kwargs) self.path = params.get("path", "/") self.text = params.get("text", "") self.params = params # file queue to be processed by many threads self.file_queue = JoinableQueue(maxsize=0) self.result_queue = Queue(maxsize=0) self.result = [] self.is_alive = {"status": True} self.re_text = re.compile(".*" + fnmatch.translate(self.text)[:-7] + ".*", re.UNICODE | re.IGNORECASE) # remove \Z(?ms) from end of result expression def run(self): try: self.preload() except Exception as e: result = {"error": True, "message": str(e), "traceback": traceback.format_exc()} self.on_error(self.status_id, result, pid=self.pid, pname=self.name) return def worker(re_text, file_queue, result_queue, logger, timeout): while int(time.time()) < timeout: if file_queue.empty() is not True: f_path = file_queue.get() try: if not is_binary(f_path): mime = mimetypes.guess_type(f_path)[0] # исключаем некоторые mime типы из поиска if mime not in ["application/pdf", "application/rar"]: with open(f_path, "rb") as fp: for line in fp: try: line = as_unicode(line) except UnicodeDecodeError: charset = chardet.detect(line) if charset.get("encoding") in ["MacCyrillic"]: detected = "windows-1251" else: detected = charset.get("encoding") if detected is None: break try: line = str(line, detected, "replace") except LookupError: pass if re_text.match(line) is not None: result_queue.put(f_path) # logger.debug("matched file = %s " % f_path) break except UnicodeDecodeError as unicode_e: logger.error("UnicodeDecodeError %s, %s" % (str(unicode_e), traceback.format_exc())) except IOError as io_e: logger.error("IOError %s, %s" % (str(io_e), traceback.format_exc())) except Exception as other_e: logger.error("Exception %s, %s" % (str(other_e), traceback.format_exc())) finally: file_queue.task_done() else: time.sleep(REQUEST_DELAY) try: self.logger.debug("findText started with timeout = %s" % TIMEOUT_LIMIT) time_limit = int(time.time()) + TIMEOUT_LIMIT # Launches a number of worker threads to perform operations using the queue of inputs for i in range(self.NUM_WORKING_PROCESSES): p = Process( target=worker, args=(self.re_text, self.file_queue, self.result_queue, self.logger, time_limit) ) p.start() proc = psutil.Process(p.pid) proc.ionice(psutil.IOPRIO_CLASS_IDLE) proc.nice(20) self.logger.debug( "Search worker #%s, set ionice = idle and nice = 20 for pid %s" % (str(i), str(p.pid)) ) self.processes.append(p) abs_path = self.get_abs_path(self.path) self.logger.debug("FM FindText worker run(), abs_path = %s" % abs_path) if not os.path.exists(abs_path): raise Exception("Provided path not exist") self.on_running(self.status_id, pid=self.pid, pname=self.name) for current, dirs, files in os.walk(abs_path): for f in files: try: file_path = os.path.join(current, f) self.file_queue.put(file_path) except UnicodeDecodeError as e: self.logger.error("UnicodeDecodeError %s, %s" % (str(e), traceback.format_exc())) except IOError as e: self.logger.error("IOError %s, %s" % (str(e), traceback.format_exc())) except Exception as e: self.logger.error("Exception %s, %s" % (str(e), traceback.format_exc())) while int(time.time()) <= time_limit: self.logger.debug( "file_queue size = %s , empty = %s (timeout: %s/%s)" % (self.file_queue.qsize(), self.file_queue.empty(), str(int(time.time())), time_limit) ) if self.file_queue.empty(): self.logger.debug("join() file_queue until workers done jobs") self.file_queue.join() break 
else: time.sleep(REQUEST_DELAY) if int(time.time()) > time_limit: self.is_alive["status"] = False for p in self.processes: try: self.logger.debug("FM FindText terminate worker process, pid = %s" % p.pid) kill(p.pid, signal.SIGKILL, self.logger) except OSError: self.logger.error("FindText unable to terminate worker process, pid = %s" % p.pid) if self.is_alive["status"] is True: while not self.result_queue.empty(): file_path = self.result_queue.get() self.result.append(self._make_file_info(file_path)) self.on_success(self.status_id, data=self.result, pid=self.pid, pname=self.name) else: result = {"error": True, "message": "Operation timeout exceeded", "traceback": ""} self.on_error(self.status_id, result, pid=self.pid, pname=self.name) except Exception as e: result = {"error": True, "message": str(e), "traceback": traceback.format_exc()} self.on_error(self.status_id, result, pid=self.pid, pname=self.name)
queue_log = JoinableQueue() test_commprocess = CommProcess(port=8080,address='127.0.0.1',events={'enable_comms':event_enable_comms,'client_disconnect':event_client_disconnect},queues={'tx_msg':queue_tx,'rx_msg':queue_rx,'log':queue_log},debug_log=debug_log_path) retcode = 0 try: event_enable_comms.set() event_enable_comms.set() test_commprocess.start() while True: if queue_log.empty()==False: print pop_queue(queue_log) if queue_rx.empty()==False: msg = pop_queue(queue_rx) print msg queue_tx.put("echoing %s\n"%msg) ## check if client disconnected : if event_client_disconnect.wait(PROCESS_EVENT_CLIENT_DISCONNECT_TIMEOUT_S): ## check if the commprocess is still alive : if not test_commprocess.is_alive(): ## what happened ? commprocess_retcode = test_commprocess.exitcode
def main(fileName): # load test config and default values with open(fileName, 'r') as f: tests = json.load(f) prepare_tests_settings(tests) default = tests['default'] jobQueue = JoinableQueue() resultQueue = JoinableQueue() # NOTE: some parameters are obsolete as they are overruled by the parameters in individual tests if default['browser'].lower() == 'chrome': # use producer-consumer mode for chrome # this mode helps isolating individual failures # as well as supporting parallel browsers workers = start_parallel_instances(default, jobQueue, resultQueue) dispatch_parallel_tests(tests, jobQueue) def terminate_jobs(_, __): logging.warning("SIGINT: terminating all the intances ") for worker in workers: # SIGTERM will trigger teardown function of the workers # so that they could nicely kill the processes (chrome, Xvfb) they started os.kill(worker.pid, signal.SIGTERM) time.sleep(0.5) sys.exit(-1) # SIGINT is for nice teardown # NOTE: if SIGKILL this process, there could be orphan processes that stop new tests # one must manually kill them if that happens signal.signal(signal.SIGINT, terminate_jobs) #loader = ChromeLoader(disable_quic=default['disable_quic'], disable_spdy=default['disable_spdy'], # check_protocol_availability=False, save_packet_capture=True, # log_ssl_keys=default['log_ssl_keys'], save_har=True, disable_local_cache=False, # headless=default['headless'], ignore_certificate_errors=default['ignore_certificate_errors']) #loader.load_pages(tests) #pprint.pprint(dict(loader.load_results)) # then wait for the queue to be empty jobQueue.join() while not resultQueue.empty(): # print all the test reports result = resultQueue.get(False) print result resultQueue.task_done() # send teardown message then wait teardown_parallel_instances(default, jobQueue) jobQueue.join() elif default['browser'].lower() == 'firefox': # simplier single thread mode for firefox loader = FirefoxLoader(disable_quic=default['disable_quic'], disable_spdy=default['disable_spdy'], check_protocol_availability=False, save_packet_capture=True, log_ssl_keys=default['log_ssl_keys'], save_har=True, disable_local_cache=False, headless=default['headless'], ignore_certificate_errors=default['ignore_certificate_errors']) loader.load_pages(tests) pprint.pprint(dict(loader.load_results)) else: logging.critical('Uknown browser %s', default['browser'].lower()) sys.exit(-1)
class FindText(BaseWorkerCustomer): NUM_WORKING_PROCESSES = 2 def __init__(self, params, session, *args, **kwargs): super(FindText, self).__init__(*args, **kwargs) self.path = params.get('path', '/') self.session = session self.session = session self.text = params.get('text', '') self.params = params # file queue to be processed by many threads self.file_queue = JoinableQueue(maxsize=0) self.result_queue = Queue(maxsize=0) self.result = [] self.is_alive = { "status": True } self.re_text = re.compile('.*' + fnmatch.translate(self.text)[:-7] + '.*', re.UNICODE | re.IGNORECASE) # remove \Z(?ms) from end of result expression def run(self): try: self.preload() sftp = self.get_sftp_connection(self.session) self.logger.debug("findText started with timeout = %s" % TIMEOUT_LIMIT) time_limit = int(time.time()) + TIMEOUT_LIMIT # Launches a number of worker threads to perform operations using the queue of inputs sftp_managers = [] for i in range(self.NUM_WORKING_PROCESSES): p = Process(target=self.worker, args=(self.re_text, self.file_queue, self.result_queue, time_limit)) p.start() proc = psutil.Process(p.pid) proc.ionice(psutil.IOPRIO_CLASS_IDLE) proc.nice(20) self.logger.debug( "Search worker #%s, set ionice = idle and nice = 20 for pid %s" % ( str(i), str(p.pid))) self.processes.append(p) abs_path = self.path self.logger.debug("FM FindText worker run(), abs_path = %s" % abs_path) if not sftp.exists(abs_path): raise Exception("Provided path not exist") self.on_running(self.status_id, pid=self.pid, pname=self.name) for current, dirs, files in sftp.walk(abs_path): for f in files: try: file_path = os.path.join(current, f) self.file_queue.put(file_path) except UnicodeDecodeError as e: self.logger.error( "UnicodeDecodeError %s, %s" % (str(e), traceback.format_exc())) except IOError as e: self.logger.error("IOError %s, %s" % (str(e), traceback.format_exc())) except Exception as e: self.logger.error( "Exception %s, %s" % (str(e), traceback.format_exc())) while int(time.time()) <= time_limit: self.logger.debug("file_queue size = %s , empty = %s (timeout: %s/%s)" % ( self.file_queue.qsize(), self.file_queue.empty(), str(int(time.time())), time_limit)) if self.file_queue.empty(): self.logger.debug("join() file_queue until workers done jobs") self.file_queue.join() break else: time.sleep(REQUEST_DELAY) if int(time.time()) > time_limit: self.is_alive['status'] = False for sftp in sftp_managers: sftp.conn.close() for p in self.processes: try: self.logger.debug("FM FindText terminate worker process, pid = %s" % p.pid) kill(p.pid, signal.SIGKILL, self.logger) except OSError: self.logger.error( "FindText unable to terminate worker process, pid = %s" % p.pid) if self.is_alive['status'] is True: while not self.result_queue.empty(): file_path = self.result_queue.get() self.result.append(sftp.make_file_info(file_path)) self.on_success(self.status_id, data=self.result, pid=self.pid, pname=self.name) else: result = { "error": True, "message": "Operation timeout exceeded", "traceback": "" } self.on_error(self.status_id, result, pid=self.pid, pname=self.name) except Exception as e: result = { "error": True, "message": str(e), "traceback": traceback.format_exc() } self.on_error(self.status_id, result, pid=self.pid, pname=self.name) def worker(self, re_text, file_queue, result_queue, timeout): try: worker_sftp = self.get_sftp_connection(self.session) while int(time.time()) < timeout: if file_queue.empty() is not True: f_path = file_queue.get() try: if not worker_sftp.is_binary(f_path): mime = mimetypes.guess_type(f_path)[0] 
# исключаем некоторые mime типы из поиска if mime not in ['application/pdf', 'application/rar']: with worker_sftp.open(f_path, 'rb') as fp: for line in fp: try: line = as_unicode(line) except UnicodeDecodeError: charset = chardet.detect(line) if charset.get('encoding') in ['MacCyrillic']: detected = 'windows-1251' else: detected = charset.get('encoding') if detected is None: break try: line = str(line, detected, "replace") except LookupError: pass if re_text.match(line) is not None: result_queue.put(f_path) self.logger.debug("matched file = %s " % f_path) break except UnicodeDecodeError as unicode_e: self.logger.error( "UnicodeDecodeError %s, %s" % (str(unicode_e), traceback.format_exc())) except IOError as io_e: self.logger.error("IOError %s, %s" % (str(io_e), traceback.format_exc())) except Exception as other_e: self.logger.error("Exception %s, %s" % (str(other_e), traceback.format_exc())) finally: file_queue.task_done() else: time.sleep(REQUEST_DELAY) worker_sftp.close() except Exception as e: result = { "error": True, "message": str(e), "traceback": traceback.format_exc() } self.logger.error('SFTP FindText Worker Exception {}'.format(result))
def process_pool_executor_handler(executor: ProcessPoolExecutor, manager: DownloadProcess, file_maps: Dict[str, str], directory: str) -> None: done_queue = JoinableQueue() def update_hook(future: Future): temp = future.result() if temp: for failed_links in temp: done_queue.put(failed_links) while manager.done_retries != manager.max_retries: print( f"Starting download {manager.get_total_links() - manager.get_total_downloaded_links_count()} links left" ) available_cpus = [0, 1, 2, 3 ] if platform.system() == "Windows" else list( os.sched_getaffinity(os.getpid())) print( f"available cpu's {available_cpus}, initializing {4 * manager.get_process_num()}" f" threads with {manager.get_thread_num()} links per " f"process") if len(manager.error_links): download_links = manager.error_links.copy() manager.error_links = [] else: download_links = manager.get_download_links().copy() process_futures: List[Future] = [] start = 0 for temp_num in range(len(download_links)): end = start + manager.get_thread_num() if end > len(download_links): end = len(download_links) cpu_num = available_cpus[temp_num % len(available_cpus)] process_futures.append( executor.submit(start_threads, download_links[start:end], file_maps, manager.get_session(), directory, manager.http2, manager.debug, cpu_num)) process_futures[-1].add_done_callback(update_hook) start = end if end >= len(download_links): break wait(process_futures) while not done_queue.empty(): link = done_queue.get() manager.error_links.append(link) manager.set_total_downloaded_links_count(manager.get_total_links() - len(manager.error_links)) if manager.debug: print( f"Total downloaded links {manager.get_total_downloaded_links_count()}" ) print(f"Error links generated {len(manager.error_links)}") if len(manager.error_links): manager.set_thread_num( int( ceil((manager.get_total_links() - manager.get_total_downloaded_links_count()) / manager.get_process_num()))) print( f"{manager.get_total_links()} was expected but " f"{manager.get_total_downloaded_links_count()} was downloaded." ) manager.done_retries += 1 print(f"Trying retry {manager.done_retries}") else: break
class VideoProcessor: def __init__(self, config: VideoProcessorConfig): self.config = config self.ballsQueue = Queue() self.cueQueue = Queue() self.ballThrottle = JoinableQueue() self.cueThrottle = JoinableQueue() self.eventQueueVP = Queue() self.eventQueueBP = Queue() self.eventQueueCP = Queue() self.event = None self.vcap = None self.vrec = None self.frameReadLock = Lock() self.ballProcess = None self.cueProcess = None self.outputModuleProcess = None self.initFrameProcessing: InitialFrameProcessing = None self.classificator = None def capture(self): self.initFrameProcessing = InitialFrameProcessing(self.config) frameWidth = Value('i', 1) frameHeight = Value('i', 1) ballProcessorConfig = BallProcessorConfig( self.config.width, self.config.height, frameWidth, frameHeight, 8, 12, 22, 1.0, 24, 90, 10, ) ballProcessorConfig.genDataSet = self.config.genDataSet ballProcessorConfig.genDataSetFolder = self.config.genDataSetFolder cueProcessorConfig = CueProcessorConfig(self.config.width, self.config.height, frameWidth, frameHeight) cueProcessorConfig.genDataSet = self.config.genDataSet cueProcessorConfig.genDataSetFolder = self.config.genDataSetFolder sharedFrame = RawArray(np.ctypeslib.as_ctypes_type(np.uint8), self.config.get_flat_shape()) sharedAvgFrame = RawArray(np.ctypeslib.as_ctypes_type(np.uint8), self.config.get_flat_shape()) numpyFrame = np.frombuffer(sharedFrame, dtype=np.uint8).reshape( self.config.get_flat_shape()) numpyAvgFrame = np.frombuffer(sharedAvgFrame, dtype=np.uint8).reshape( self.config.get_flat_shape()) self.ballProcess = BallProcessor(self.ballsQueue, self.ballThrottle, sharedFrame, sharedAvgFrame, self.frameReadLock, ballProcessorConfig, self.eventQueueBP) self.cueProcess = CueProcessor(self.cueQueue, self.cueThrottle, sharedFrame, sharedAvgFrame, self.frameReadLock, cueProcessorConfig, self.eventQueueCP) self.outputModuleProcess = OutputModule(self.ballsQueue, self.cueQueue, self.eventQueueVP, self.eventQueueBP, self.eventQueueCP, self.config.webPort) self.ballProcess.start() # Póki nie ma implementacji nie może kręcić się na sucho self.cueProcess.start() self.outputModuleProcess.start() try: self.vcap = cv2.VideoCapture( "udp://0.0.0.0:" + str(self.config.udpPort) + "?overrun_nonfatal=1", cv2.CAP_FFMPEG) self.vcap.set(cv2.CAP_PROP_BUFFERSIZE, 0) while (1): self.eventHandling() ret, frame = self.vcap.read() if ret: cv2.imshow('VP: ORIGINAL', frame) self.initFrameProcessing.on_frame(frame) #self.initFrameProcessing.display_components() w, h = self.initFrameProcessing.get_pool_size() frame = self.initFrameProcessing.get_warped_frame( ).flatten() frame = np.resize(frame, self.config.get_flat_shape()) avg_frame = self.initFrameProcessing.get_avg_frame( ).flatten() avg_frame = np.resize(avg_frame, self.config.get_flat_shape()) # chwilowe, bo wywala się gdy podczas wykrywania stołu jest ten fragment nagrania bez stołu if w < 500 or h < 500: continue with self.frameReadLock: frameWidth.value = w frameHeight.value = h np.copyto(numpyFrame, frame) np.copyto(numpyAvgFrame, avg_frame) if not self.ballThrottle.empty(): self.ballThrottle.get() self.ballThrottle.task_done() if not self.cueThrottle.empty(): self.cueThrottle.get() self.cueThrottle.task_done() # Main wait to refresh windows c = cv2.waitKey(1) except (KeyboardInterrupt, SystemExit): print("VP: Interrupt") self.cleanup() self.terminate() print("VP: Exit") sys.exit(0) def record(self): try: self.vrec = cv2.VideoWriter( self.config.recordingPath, cv2.VideoWriter_fourcc(*'MP4V'), self.config.recordingFps, (self.config.width, 
self.config.height)) self.vcap = cv2.VideoCapture( "udp://0.0.0.0:" + str(self.config.udpPort), cv2.CAP_FFMPEG) while (1): ret, frame = self.vcap.read() if frame is not None: self.vrec.write(frame) cv2.imshow('VIDEO', frame) c = cv2.waitKey(1) if c & 0xFF == ord('q'): self.cleanup() break except (KeyboardInterrupt, SystemExit): self.cleanup() sys.exit(0) def eventHandling(self): while not self.eventQueueVP.empty(): event = self.eventQueueVP.get_nowait() print("VP: ", event.eventType) if isinstance(event, RerunInitRequestEvent): self.initFrameProcessing.reset_avg() elif isinstance(event, InitDurationChangeEvent): self.config.initDuration = int(event.initDuration) elif isinstance(event, PoolColorsChangeEvent): self.config.pool_color_range = event.pool_color_range def cleanup(self): if self.vrec is not None: self.vrec.release() if self.vcap is not None: self.vcap.release() def terminate(self): self.ballProcess.terminate() self.ballProcess.join() self.cueProcess.terminate() self.cueProcess.join() self.outputModuleProcess.kill()
def main(): jobs = JoinableQueue() result = JoinableQueue() numToProcess = -1 scores = pd.DataFrame(columns=['fmeasure','precision','recall', 'numResult','maxDistance','topHits', 'hits', "contextSteps"]) manual_annotations = get_manual_annotations(numToProcess) manual_tuples = get_ir_tuples(manual_annotations) print len(manual_annotations) for i in range(weighted_kmeans_clustering_passes): print "Training pass",i+1 train_kmeans(manual_annotations.keys(), target_class_subtree) print "Complete." for key in manual_annotations.keys(): jobs.put(key) processed_count = Counter() for i in xrange(NUMBER_OF_PROCESSES): p = Process(target=work, args=(i, jobs, result, processed_count)) p.daemon = True p.start() #work(1, jobs, result, processed_count) automated_annotations = {} distances = {} jobs.join() while not result.empty(): dataset, classes = result.get() automated_annotations[dataset] = set(classes.keys()) distances[dataset] = classes result.task_done() automated_tuples = get_ir_tuples(automated_annotations) hits = manual_tuples & automated_tuples misses = manual_tuples - automated_tuples precision = float(len(hits)) / len(automated_tuples) recall = float(len(hits)) / len(manual_tuples) fmeasure = 2 * (precision * recall) / (precision + recall) # print '\t'.join([str(x) for x in [precision, recall, fmeasure, # numResult, minScore, topHits]]) scores = scores.append(dict(precision=precision, recall=recall, fmeasure=fmeasure, hits=len(manual_tuples),topHits=topHits, maxDistance=maxDistance, contextSteps = context_steps), ignore_index=True) print '\n' print scores results_file = 'results.csv' if len(sys.argv) > 1: results_file = sys.argv[1] hit_curves = csv.writer(open(results_file,'wb'),delimiter=",") hit_curves.writerow(['dataset','class','distance','hit']) for dataset, c in automated_tuples: distance = round(distances[dataset][c],3) hit = 1 if (dataset,c) in manual_tuples else 0 hit_curves.writerow([dataset,c,distance,hit])
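# Hedged sketch of the queue wiring main() uses above: fill a JoinableQueue with jobs,
# start daemon worker processes, block on jobs.join() until every job is acknowledged,
# then drain the result queue. work() here is a stand-in that squares numbers; the real
# work() does annotation scoring and NUMBER_OF_PROCESSES is whatever the script defines.
from multiprocessing import Process, JoinableQueue

NUMBER_OF_PROCESSES = 4
NUM_JOBS = 20

def work(worker_id, jobs, result):
    while True:
        item = jobs.get()
        result.put((item, item * item))
        jobs.task_done()                     # lets jobs.join() in the parent return

if __name__ == "__main__":
    jobs, result = JoinableQueue(), JoinableQueue()
    for n in range(NUM_JOBS):
        jobs.put(n)
    for i in range(NUMBER_OF_PROCESSES):
        p = Process(target=work, args=(i, jobs, result))
        p.daemon = True                      # daemons die with the parent, so no poison pills
        p.start()
    jobs.join()                              # every job fetched and task_done()'d
    # Drain by the known count rather than result.empty(): empty() is only approximate,
    # so a fixed count avoids missing results that are still being flushed.
    for _ in range(NUM_JOBS):
        item, square = result.get()
        result.task_done()                   # not required here, but mirrors the snippet
    print("collected", NUM_JOBS, "results")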
class Task: def __init__(self, name, opts, resdir, log): self.__name = name self.__opts = opts self.__resdir = resdir self.__log = log self.__proc = None self.__is_finished = False self.__results = None self.__refs = {} task_mod = __import__(name) installed_version = int(task_mod.Task.version) actual_version = int(self.__opts['version']) if installed_version < actual_version: # update installed task raise Exception("Version {0} for task {1} is too old, task must be updated to {2}".format(installed_version, name, actual_version)) f = io.StringIO() #with redirect_stdout(f): self.__cls = task_mod.Task() #print('Got stdout: "{0}"'.format(f.getvalue())) def __collect_argrefs(self, name2task): if not hasattr(self.__cls, 'refs'): return dict() inrefs = self.__cls.refs if not isinstance(inrefs, dict): raise Exception("Refs are not dict") logger.debug('refs found: {0}'.format(inrefs)) outrefs = {} for name, ref in inrefs.items(): rtask_name, rtask_retval_name = ref.split('.') rtask_res = name2task[rtask_name].get_result() if rtask_retval_name not in rtask_res: raise Exception('Task {0} does not return val {1} referenced by another task'.format(rtask_name, rtask_retval_name)) outrefs[name] = rtask_res[rtask_retval_name] return outrefs @staticmethod def __run_wrapper(functor, args, refs, resdir, log, q, exc_q): try: res = functor(args, refs, resdir, log) except Exception as e: traceback.print_exc() exc_q.put({'Exited by exception': repr(e)}) exc_q.task_done() else: if not isinstance(res, dict): res = {'result': res} q.put(res) q.task_done() def run(self, name2task): refs = self.__collect_argrefs(name2task) self.__q = JoinableQueue() self.__exc_q = JoinableQueue() self.__proc = Process(target=Task.__run_wrapper, args=(self.__cls, self.__opts['args'], refs, self.__resdir, self.__log, self.__q, self.__exc_q)) self.__proc.start() def is_alive(self): return self.__proc is not None and self.__proc.is_alive() def probe(self): if self.__proc is None or self.__proc.is_alive(): return self.__q.join() self.__exc_q.join() if not self.__exc_q.empty(): self.__results = self.__exc_q.get() elif not self.__q.empty(): self.__results = self.__q.get() else: self.__results = {} self.__proc.join() self.__proc = None self.__is_finished = True def is_finished(self): return self.__is_finished def get_result(self): return self.__results def get_name(self): return self.__name def get_log(self): with open(self.__log) as f: data = f.read() return data
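# Hedged sketch of the __run_wrapper idea above: run a callable in a child process and
# pass back either its return value or the exception text through a pair of queues, so
# the parent can inspect the outcome without the exception propagating out of the child.
# run_wrapper and risky are illustrative names; the task_done()/join() bookkeeping from
# the original is omitted because the parent simply join()s the process here.
import traceback
from multiprocessing import Process, JoinableQueue

def run_wrapper(functor, args, q, exc_q):
    try:
        res = functor(*args)
    except Exception as e:
        traceback.print_exc()
        exc_q.put({'Exited by exception': repr(e)})
    else:
        q.put(res if isinstance(res, dict) else {'result': res})

def risky(x):
    if x < 0:
        raise ValueError("negative input")
    return {'square': x * x}

if __name__ == "__main__":
    for value in (3, -1):
        q, exc_q = JoinableQueue(), JoinableQueue()
        p = Process(target=run_wrapper, args=(risky, (value,), q, exc_q))
        p.start()
        p.join()                                       # child has flushed its queues by now
        outcome = exc_q.get() if not exc_q.empty() else q.get()
        print(value, "->", outcome)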
class MultiThreadedFlickrCrawler: ########################################################################### # System parameters and initializations ########################################################################### def __init__(self, cfg, category, max_num_images, communication_q, rate_limit): self.cfg = cfg self.category = category argv = self.cfg.vars self.communication_q = communication_q self.do_exit = False self.rate_limit = rate_limit self.rate_q = Queue() # flickr auth information: change these to your flickr api keys and secret self.flickrAPIkeys = argv["flickrAPIkeys"].split(', ') # API key self.flickrAPIsecrets = argv["flickrAPIsecrets"].split(', ') # shared "secret" self.queryFileName = argv["queryFileName"] #'query_terms.txt' self.homeDir = argv["homeDir"] self.imagesPerDir = int(argv["imagesPerDir"]) self.flickrerrors = 0 # Crawler parameters self.resultsPerPage = int(argv["resultsPerPage"]) self.downloadsPerQuery = int(argv["downloadsPerQuery"]) self.numberOfThreads = int(argv["numberOfThreads"]) self.startTime = int(argv["crawlerBeginTime"]) #1072915200 # 01/01/2004 self.finalTime = int(time.time()) self.singleDay = 86400 # 24hr*60min*60sec = 1day self.max_num_images = max_num_images self.database = argv["databaseName"] # Structures Initialization self.dbdir = DBDirectories(self.homeDir, argv["sysdir"], category) self.indexOfUniqueImages = self.dbdir.inf + 'imageIndex.txt' self.indexOfUniqueUsers = self.dbdir.inf + 'usersIndex.txt' self.recentUsers = dict() self.queryTerms = [] # Multithreaded downloading of images self.queue = JoinableQueue() self.out_queue = JoinableQueue() self.threadsList = [] for i in range(self.numberOfThreads): t = DownloadImageThread(self.queue, self.out_queue, self.dbdir.img, self.dbdir.txt, self.category, self.database) t.setDaemon(True) t.start() self.threadsList.append(t) print(("{} initialized".format(self.category))) ########################################################################### # Method to load query terms ########################################################################### def loadQueries(self): # Each term is a category self.queryTerms = [self.category] print(('positive queries:', self.queryTerms)) list(map(lambda t: t.setValidTags(self.queryTerms), self.threadsList)) return len(self.queryTerms) ########################################################################### # Method to load index of image names ########################################################################### def loadImageNamesIndex(self): print('Loading index of images') if os.path.exists(self.indexOfUniqueImages): self.allImageNames = dict( [(img.replace('\n', ''), True) for img in open(self.indexOfUniqueImages).readlines()]) print(('Index with', len(self.allImageNames), 'names is ready to use')) else: self.allImageNames = dict() print(('No previous index found at {}'.format(self.indexOfUniqueImages))) print('Loading index of users') if os.path.exists(self.indexOfUniqueUsers): self.recentUsers = dict([(usr.replace('\n', ''), 1) for usr in open(self.indexOfUniqueUsers).readlines()]) print(('Index with', len(self.recentUsers), 'users is ready to use')) else: self.recentUsers = dict() print(('No previous user index found at {}'.format(self.indexOfUniqueUsers))) ########################################################################### # Find out if an image is a duplicate or of a user already visited ########################################################################### def isDuplicateImage(self, flickrResult): b = flickrResult 
owner_date = b['owner'] + '_' + simpleDateFormat(b['datetaken']) imgName = b['server'] + '_' + b['id'] + '_' + b['secret'] + '_' + owner_date + '.jpg' alreadyIndexed = False userPhotos = 0 if imgName in self.allImageNames: alreadyIndexed = self.allImageNames[imgName] else: self.allImageNames[imgName] = False if owner_date in self.recentUsers: userPhotos = self.recentUsers[owner_date] else: self.recentUsers[owner_date] = 0 if (not alreadyIndexed) and userPhotos < 1: self.recentUsers[owner_date] += 1 self.allImageNames[imgName] = True return False else: return True ########################################################################### #Find out if medium format of photo exists for download ########################################################################### def get_url(self, flickrResult, fapi, size): url = "https://farm{}.staticflickr.com/{}/{}_{}.jpg".format(flickrResult['farm'], flickrResult['server'], flickrResult['id'], flickrResult['secret']) return True, url #TODO find way to speed up actual url retrieval # image_id = flickrResult['id'] # success = False # try: # rsp = fapi.photos_getSizes(api_key=self.flickrAPIKey, photo_id=image_id) # fapi.testFailure(rsp) # except: # print sys.exc_info()[0] # print ('Exception encountered while querying for urls\n') # else: # if getattr(rsp, 'sizes', None): # if int(rsp.sizes[0]['candownload']) == 1: # if getattr(rsp.sizes[0], 'size', None): # for image_size in rsp.sizes[0].size: # if image_size['label'] == size: # return True, image_size['source'] # # return False, "" ########################################################################### # Update index of unique image names ########################################################################### def updateImageNamesIndex(self, newImages): with open(self.indexOfUniqueImages, 'a') as indexFile: for img in newImages: indexFile.write(img + '\n') self.allImageNames = [] ########################################################################### # Main Method. This runs the crawler in an infinite loop ########################################################################### def start(self): socket.setdefaulttimeout(30) #30 second time out on sockets before they throw self.cfg.log(self.homeDir, "CRAWLER STARTED") while not self.do_exit: try: command = self.communication_q.get(False) except Empty as e: #Randomly choose flickrAPIkeys and flickrAPIsecrets currentKey = int(math.floor(random.random()*len(self.flickrAPIkeys))) # make a new FlickrAPI instance fapi = FlickrAPI(self.flickrAPIkeys[currentKey], self.flickrAPIsecrets[currentKey]) num_queries = self.loadQueries() if num_queries == 0: break newImages = [] # Set time variables self.finalTime = int(time.time()) currentTimeWindow = self.finalTime - self.startTime mintime = self.startTime + random.randint(0, currentTimeWindow) maxtime = mintime + 3 * self.singleDay print(('Since:', datetime.fromtimestamp(mintime))) print(('Until:', datetime.fromtimestamp(maxtime))) print(('Previous Users:', len(self.recentUsers))) self.loadImageNamesIndex() if len(self.allImageNames) > self.max_num_images: print("Max Images reached") break # Search Images using the query terms for current_tag in range(0, num_queries): dirNumName = self.dbdir.uploadCurrentDirAndGetNext(self.imagesPerDir, self.queryTerms) print(("Current Directory Number: ", dirNumName)) #form the query string. query_string = self.queryTerms[current_tag] print(('\n\nquery_string is ' + query_string)) #only visit 8 pages max, to try and avoid the dreaded duplicate bug. 
#8 pages * 250 images = 2000 images, should be duplicate safe. Most interesting pictures will be taken. num_visit_pages = 16 pagenum = 1 while ( pagenum <= num_visit_pages ): if (self.rate_q.qsize()>self.rate_limit): #Age out time stamps older than one hour found_all = False while(not found_all): next_stamp = self.rate_q.get() if time.time() - next_stamp < 3600: found_all = True self.rate_q.put(next_stamp) #Wait to age out time stamps if exceeded rate limit if (self.rate_q.qsize()>self.rate_limit): next_stamp = self.rate_q.get() remaining_time = 3600 - (time.time() - next_stamp) time.sleep(remaining_time) self.rate_q.put(time.time()+60) try: rsp = fapi.photos_search(api_key=self.flickrAPIkeys[currentKey], ispublic="1", media="photos", per_page=str(self.resultsPerPage), page=str(pagenum), sort="interestingness-desc", text=query_string, extras="tags, original_format, license, geo, date_taken, date_upload, o_dims, views, description", min_upload_date=str(mintime), max_upload_date=str(maxtime)) fapi.testFailure(rsp) except KeyboardInterrupt: print('Keyboard exception while querying for images, exiting\n') raise except (IOError, SSLError) as e: print(('Error on Flickr photo request:{}\n'.format(e.strerror))) except FlickrExpatError as e: print(('Exception encountered while querying for images: {}\n'.format(e.message))) print(('{}: {} to {} page {}\n'.format(query_string, mintime, maxtime, pagenum))) print((e.xmlstr)) #I've identified two possible causes of this error: (1)Bad Gateway and (2)bad unicode characters in xml time.sleep(5) #Waiting is best cure for bad gateway pagenum = pagenum + 1 #Skipping to next page is best cure for bad character #Just in case it has some connection to the rate limit, change the key #Randomly choose flickrAPIkeys and flickrAPIsecrets currentKey = int(math.floor(random.random()*len(self.flickrAPIkeys))) # make a new FlickrAPI instance fapi = FlickrAPI(self.flickrAPIkeys[currentKey], self.flickrAPIsecrets[currentKey]) self.flickrerrors += 1 if self.flickrerrors > 5: print(("Too many Flickr Expat Errors in {}: Exiting".format(self.category))) exit(1) except Exception as e: print((sys.exc_info()[0])) print('Exception encountered while querying for images\n') else: # Process results if getattr(rsp, 'photos', None): if getattr(rsp.photos[0], 'photo', None): random.shuffle(rsp.photos[0].photo) for k in range(0, min(self.downloadsPerQuery, len(rsp.photos[0].photo))): b = rsp.photos[0].photo[k] if not self.isDuplicateImage(b): isDownloadable, url = self.get_url(b, fapi, "Medium 640") if isDownloadable: b["url"] = url self.queue.put((b, dirNumName)) print('Waiting threads') self.queue.join() while not self.out_queue.empty(): newImages.append(self.out_queue.get()) print((len(newImages), ' downloaded images')) pagenum = pagenum + 1 #this is in the else exception block. It won't increment for a failure. num_visit_pages = min(4, int(rsp.photos[0]['pages'])) # End While of Pages # BEGIN: PROCESS DOWNLOADED IMAGES self.updateImageNamesIndex(newImages) else: if command == "exit": self.do_exit = True print(("Wait for safe exit {}".format(self.category))) print('End') self.cfg.log(self.homeDir, "CRAWLER STOPPED")
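# Hedged, single-process sketch of the rate-limit bookkeeping inside start() above: each
# API call pushes a timestamp onto a queue, stamps older than one hour are aged out, and
# if the queue is still over the limit the caller sleeps until the oldest stamp leaves
# the window. wait_for_slot and the limit value are illustrative; the crawler uses a
# multiprocessing Queue and slightly different bookkeeping after the sleep.
import time
from queue import Queue

def wait_for_slot(rate_q, rate_limit, window=3600):
    if rate_q.qsize() > rate_limit:
        # Age out time stamps older than the window
        while not rate_q.empty():
            oldest = rate_q.get()
            if time.time() - oldest < window:
                rate_q.put(oldest)           # still inside the window: keep it, stop ageing
                break
        # Still over the limit: wait until the oldest remaining stamp expires
        if rate_q.qsize() > rate_limit:
            oldest = rate_q.get()
            time.sleep(max(0.0, window - (time.time() - oldest)))
    rate_q.put(time.time())                  # record this request

if __name__ == "__main__":
    rate_q = Queue()
    for _ in range(3):
        wait_for_slot(rate_q, rate_limit=3600)   # e.g. 3600 requests per rolling hour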
class WebDav: NUM_WORKING_PROCESSES = 5 def __init__(self, host, user, passwd, timeout=-999, logger=None): self.fp = dict() webdav_host = host self.webdav_host = webdav_host self.host = host self.user = user self.passwd = passwd self.processes = [] self.file_queue = JoinableQueue(maxsize=0) self.result_queue = Queue(maxsize=0) self.is_alive = { "status": True } options = { 'webdav_hostname': self.webdav_host, 'webdav_login': self.user, 'webdav_password': self.passwd } self.webdavClient = wc.Client(options) self.logger = logger self._tzinfo = TimeZoneMSK() def parent(self, path): return urn.Urn(path).parent() def path(self, path): return urn.Urn(path).path() def generate_file_info(self, file_path): info = self.webdavClient.info(file_path) is_dir = False is_link = False if self.webdavClient.is_dir(file_path): is_dir = True else: pass file_name = urn.Urn(file_path).filename().replace("/", "") file_dir = urn.Urn(file_path).parent() ext = '' divide = file_name.split('.') if len(divide) > 1: ext = file_name.split('.')[-1].lower() mtime = info['modified'] file_info = { "is_dir": is_dir, "is_link": is_link, "name": file_name, "ext": ext, "path": file_dir, "owner": self.user, "mode": "600", "size": info['size'] if not is_dir else 0, "mtime": mtime, 'mtime_str': str(mtime), } return file_info def _make_file_info(self, file_queue, result_queue, logger, timeout): while int(time.time()) < timeout: if file_queue.empty() is not True: file_path = file_queue.get() try: file_info = self.generate_file_info(file_path) result_queue.put(file_info) except UnicodeDecodeError as unicode_e: logger.error( "UnicodeDecodeError %s, %s" % (str(unicode_e), traceback.format_exc())) except IOError as io_e: logger.error("IOError %s, %s" % (str(io_e), traceback.format_exc())) except Exception as other_e: logger.error("Exception %s, %s" % (str(other_e), traceback.format_exc())) finally: file_queue.task_done() else: time.sleep(REQUEST_DELAY) @staticmethod def to_byte(value): if isinstance(value, str): try: value = value.encode("utf-8") except UnicodeDecodeError: value = value.encode("ISO-8859-1") return value def size(self, path): try: return self.webdavClient.info(path)['size'] except Exception as e: self.logger.error("Error in WebDav size(): %s, traceback = %s" % (str(e), traceback.format_exc())) return 0 def info(self, path): return self.webdavClient.info(self.to_byte(path)) def exists(self, path): return self.webdavClient.check(path) def isdir(self, path): return self.webdavClient.is_dir(path) def isfile(self, path): return not self.webdavClient.is_dir(self.to_byte(path)) def list(self, path): flist = { "path": path, "items": [] } try: self.webdavClient.check('/') except Exception: raise Exception("Error during establishing webdav connection") listdir = self.webdavClient.list(self.to_byte(path)) self.logger.info("listdir=%s", listdir) time_limit = int(time.time()) + TIMEOUT_LIMIT self.file_queue = JoinableQueue(maxsize=0) self.result_queue = Queue(maxsize=0) for i in range(self.NUM_WORKING_PROCESSES): p = Process(target=self._make_file_info, args=(self.file_queue, self.result_queue, self.logger, time_limit)) p.start() proc = psutil.Process(p.pid) proc.ionice(psutil.IOPRIO_CLASS_IDLE) proc.nice(20) self.logger.debug( "ListDir worker #%s, set ionice = idle and nice = 20 for pid %s" % ( str(i), str(p.pid))) self.processes.append(p) for name in listdir: try: item_path = '{0}/{1}'.format(path, name) self.file_queue.put(item_path) except UnicodeDecodeError as e: self.logger.error( "UnicodeDecodeError %s, %s" % (str(e), 
traceback.format_exc())) except IOError as e: self.logger.error("IOError %s, %s" % (str(e), traceback.format_exc())) except Exception as e: self.logger.error( "Exception %s, %s" % (str(e), traceback.format_exc())) while not self.file_queue.empty(): self.logger.debug("file_queue size = %s , empty = %s (timeout: %s/%s)" % ( self.file_queue.qsize(), self.file_queue.empty(), str(int(time.time())), time_limit)) time.sleep(REQUEST_DELAY) if self.file_queue.empty(): self.logger.debug("join() file_queue until workers done jobs") self.file_queue.join() for p in self.processes: try: self.logger.debug("WebDav ListDir terminate worker process, pid = %s" % p.pid) kill(p.pid, signal.SIGKILL, self.logger) except OSError: self.logger.error( "ListDir unable to terminate worker process, pid = %s" % p.pid) if self.is_alive['status'] is True: while not self.result_queue.empty(): file_info = self.result_queue.get() flist["items"].append(file_info) return flist def listdir(self, path): listdir = self.webdavClient.list(path) listing = [] for name in listdir: item_path = '{0}/{1}'.format(path, name) listing.append(item_path) return listing def remove(self, target): try: self.logger.debug("Removing target=%s" % target) if self.isdir(target): target += '/' self.webdavClient.unpublish(target) self.webdavClient.clean(target) except Exception as e: self.logger.error("Error in WebDav dir remove(): %s, traceback = %s" % (str(e), traceback.format_exc())) raise Exception def mkdir(self, path): self.logger.debug("Creating directory=%s" % path) return self.webdavClient.mkdir(path) def upload(self, source, target, overwrite=False, rename=None, operation_progress=None): result = {} file_list = {} succeed = [] failed = [] try: if rename is not None: target_path = os.path.join(target, rename) else: target_path = os.path.join(target, source) if not overwrite and self.exists(target_path): failed.append(source) raise Exception("File '%s' already exists and overwrite not permitted" % target_path) try: self.logger.debug("Uploading target_path=%s, source=%s" % (target_path, source)) self.webdavClient.upload(target_path, source, operation_progress) except Exception as e: failed.append(source) self.logger.error("Error in WebDav upload(): %s, traceback = %s" % (str(e), traceback.format_exc())) raise Exception("Error during file uploading %s" % traceback.format_exc()) succeed.append(source) file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = True result['error'] = None result['file_list'] = file_list return result except Exception as e: self.logger.error("Error in WebDav upload(): %s, traceback = %s" % (str(e), traceback.format_exc())) file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = False result['error'] = e result['file_list'] = file_list return result def download(self, source, target, operation_progress=None): result = {} file_list = {} succeed = [] failed = [] try: target_path = os.path.join(target, os.path.basename(source)) try: self.logger.debug("Downloading source=%s, target_path=%s" % (source, target_path)) self.webdavClient.download(source, target_path, operation_progress) except Exception as e: failed.append(source) self.logger.error("Error in WebDav download(): %s, traceback = %s" % (str(e), traceback.format_exc())) raise Exception("Error during file download") succeed.append(source) file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = True result['error'] = None result['file_list'] = file_list return result except Exception as e: 
self.logger.error("Error in WebDav download(): %s, traceback = %s" % (str(e), traceback.format_exc())) file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = False result['error'] = e result['file_list'] = file_list return result def copy_file(self, source, target, overwrite=False): result = {} file_list = {} succeed = [] failed = [] try: if not overwrite and self.exists(target): failed.append(source) raise Exception('file exist and cannot be overwritten') try: self.logger.debug("Copying file source=%s, target=%s" % (source, target)) self.webdavClient.copy(source, target) except Exception as e: failed.append(source) raise Exception('Cannot copy file %s' % (e,)) succeed.append(source) file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = True result['error'] = None result['file_list'] = file_list return result except Exception as e: file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = False result['error'] = e result['file_list'] = file_list return result def move_file(self, source, target, overwrite=False): result = {} file_list = {} succeed = [] failed = [] try: if not overwrite and self.exists(target): failed.append(source) raise Exception('file exist and cannot be overwritten') try: self.logger.debug("Moving file source=%s, target=%s" % (source, target)) self.webdavClient.move(source, target) except Exception as e: failed.append(source) raise Exception('Cannot move file %s' % (e,)) succeed.append(source) file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = True result['error'] = None result['file_list'] = file_list return result except Exception as e: file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = False result['error'] = e result['file_list'] = file_list return result def make_destination_dir(self, destination, overwrite): self.logger.info("making destination %s" % destination) if not self.exists(destination): self.mkdir(destination) elif overwrite and self.exists(destination) and not self.isdir(destination): self.remove(destination) self.mkdir(destination) elif not overwrite and self.exists(destination) and not self.isdir(destination): raise Exception("destination is not a dir") else: pass
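# Hedged sketch of the worker-pool shape used by WebDav.list() above: a fixed number of
# processes poll a JoinableQueue of paths until a wall-clock deadline, push info dicts
# onto a result Queue, and the parent join()s the job queue before terminating the
# (otherwise endlessly polling) workers and draining the results. stat_path stands in
# for generate_file_info; REQUEST_DELAY and the deadline are illustrative values.
import os
import time
from multiprocessing import Process, JoinableQueue, Queue

REQUEST_DELAY = 0.05

def stat_path(path):
    return {"name": os.path.basename(path), "is_dir": os.path.isdir(path)}

def info_worker(file_queue, result_queue, deadline):
    while time.time() < deadline:
        if not file_queue.empty():
            path = file_queue.get()
            try:
                result_queue.put(stat_path(path))
            finally:
                file_queue.task_done()       # always acknowledge, even on errors
        else:
            time.sleep(REQUEST_DELAY)

if __name__ == "__main__":
    file_queue, result_queue = JoinableQueue(), Queue()
    deadline = time.time() + 10
    workers = [Process(target=info_worker, args=(file_queue, result_queue, deadline))
               for _ in range(3)]
    for w in workers:
        w.start()
    for name in os.listdir("."):
        file_queue.put(name)
    file_queue.join()                        # every queued path has been acknowledged
    for w in workers:
        w.terminate()                        # they would otherwise poll until the deadline
        w.join()
    while not result_queue.empty():          # same approximate drain as the original list()
        print(result_queue.get())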
def main(): jobs = JoinableQueue() result = JoinableQueue() numToProcess = -1 scores = pd.DataFrame(columns=[ 'fmeasure', 'precision', 'recall', 'numResult', 'maxDistance', 'topHits', 'contentWeight', 'relationWeight', 'hits', "contextSteps" ]) manual_annotations = get_manual_annotations(numToProcess) manual_tuples = get_ir_tuples(manual_annotations) print len(manual_annotations) for i in range(weighted_kmeans_clustering_passes): print "Training pass", i + 1 train_kmeans(manual_annotations.keys(), target_class_subtree) print "Complete." for key in manual_annotations.keys(): jobs.put(key) processed_count = Counter() for i in xrange(NUMBER_OF_PROCESSES): p = Process(target=work, args=(i, jobs, result, processed_count)) p.daemon = True p.start() #work(1, jobs, result, processed_count) automated_annotations = {} distances = {} jobs.join() while not result.empty(): dataset, classes = result.get() automated_annotations[dataset] = set(classes.keys()) distances[dataset] = classes result.task_done() automated_tuples = get_ir_tuples(automated_annotations) hits = manual_tuples & automated_tuples misses = manual_tuples - automated_tuples precision = float(len(hits)) / len(automated_tuples) recall = float(len(hits)) / len(manual_tuples) fmeasure = 2 * (precision * recall) / (precision + recall) # print '\t'.join([str(x) for x in [precision, recall, fmeasure, # numResult, minScore, topHits]]) scores = scores.append(dict(precision=precision, recall=recall, fmeasure=fmeasure, hits=len(manual_tuples), topHits=topHits, maxDistance=maxDistance, contextSteps=context_steps), ignore_index=True) print '\n' print scores results_file = 'results.csv' if len(sys.argv) > 1: results_file = sys.argv[1] hit_curves = csv.writer(open(results_file, 'wb'), delimiter=",") hit_curves.writerow(['dataset', 'class', 'distance', 'hit']) for dataset, c in automated_tuples: distance = round(distances[dataset][c], 3) hit = 1 if (dataset, c) in manual_tuples else 0 hit_curves.writerow([dataset, c, distance, hit])
class FileIO(BaseObj): """FileIO object Usage: from nfstest.file_io import FileIO # Instantiate FileIO object given top level directory x = FileIO(datadir="/tmp/data") # Run workload creating the top level directory if necessary x.run() """ def __init__(self, **kwargs): """Constructor Initialize object's private data datadir: Top level directory where files will be created, it will be created if it does not exist seed: Seed to initialized the random number generator [default: automatically generated] nprocs: Number of processes to use [default: 1] runtime: Run time [default: 0 (indefinitely)] verbose: Verbose level: none|info|debug|dbg1-7|all [default: 'none'] exiterr: Exit on first error [default: False] read: Read file percentage [default: 40] write: Write file percentage [default: 40] rdwr: Read/write file percentage [default: 20] randio: Random file access percentage [default: 50] iodelay: Seconds to delay I/O operations [default: 0.0] direct: Use direct I/O [default: False] rdwronly: Use read and write only, no rename, remove, etc. [default: False] create: Create file percentage [default: 5] odgrade: Open downgrade percentage [default: 10] osync: Open file with O_SYNC [default: 20] fsync: Percentage of fsync after write [default: 5] rename: Rename file percentage [default: 5] remove: Remove file percentage [default: 5] trunc: Truncate file percentage [default: 5] ftrunc: Truncate opened file percentage [default: 5] link: Create hard link percentage [default: 2] slink: Create symbolic link percentage [default: 1] readdir: List contents of directory percentage [default: 1] lock: Lock file percentage [default: 20] unlock: Unlock file percentage [default: 80] tlock: Lock test percentage [default: 50] lockfull: Lock full file percentage [default: 50] minfiles: Mininum number of files to create before any file operation is executed [default: 10] fsizeavg: File size average [default: 1m] fsizedev: File size standard deviation [default: 256k] rsize: Read block size [default: 64k] rsizedev: Read block size standard deviation [default: 8k] wsize: Write block size [default: 64k] wsizedev: Write block size standard deviation [default: 8k] sizemult: Size multiplier [default: 1] createlog: Create log file [default: False] createlogs: Create a log file for each process [default: False] logdir: Log directory [default: '/tmp'] """ self.progname = os.path.basename(sys.argv[0]) self.datadir = kwargs.pop("datadir", None) self.seed = kwargs.pop("seed", P_SEED) self.nprocs = kwargs.pop("nprocs", P_NPROCS) self.runtime = kwargs.pop("runtime", P_RUNTIME) self.verbose = kwargs.pop("verbose", P_VERBOSE) self.createlog = kwargs.pop("createlog", P_CREATELOG) self.createlogs = kwargs.pop("createlogs", P_CREATELOGS) self.create = kwargs.pop("create", P_CREATE) self.osync = kwargs.pop("osync", P_OSYNC) self.fsync = kwargs.pop("fsync", P_FSYNC) self.read = kwargs.pop("read", None) self.write = kwargs.pop("write", None) self.rdwr = kwargs.pop("rdwr", None) self.odgrade = kwargs.pop("odgrade", P_ODGRADE) self.randio = kwargs.pop("randio", P_RANDIO) self.rdwronly = kwargs.pop("rdwronly", P_RDWRONLY) self.iodelay = kwargs.pop("iodelay", P_IODELAY) self.direct = kwargs.pop("direct", P_DIRECT) self.logdir = kwargs.pop("logdir", P_TMPDIR) self.exiterr = kwargs.pop("exiterr", False) self.minfiles = kwargs.pop("minfiles", str(MIN_FILES)) if self.datadir is None: print "Error: datadir is required" sys.exit(2) data = [int(x) for x in self.minfiles.split(",")] if len(data) == 1: self.up_minfiles = -1 self.top_minfiles = data[0] 
self.bot_minfiles = data[0] elif len(data) > 1: self.up_minfiles = 0 self.top_minfiles = max(data) self.bot_minfiles = min(data) else: print "Error: option minfiles must be an integer or two integers separated by a ',': %s" % self.minfiles sys.exit(2) self.minfiles = self.top_minfiles if self.rdwronly: # When rdwronly option is given, set all options for manipulating # files to zero if not explicitly given self.rename = kwargs.pop("rename", 0) self.remove = kwargs.pop("remove", 0) self.trunc = kwargs.pop("trunc", 0) self.ftrunc = kwargs.pop("ftrunc", 0) self.link = kwargs.pop("link", 0) self.slink = kwargs.pop("slink", 0) self.readdir = kwargs.pop("readdir", 0) self.lock = kwargs.pop("lock", 0) self.unlock = kwargs.pop("unlock", 0) self.tlock = kwargs.pop("tlock", 0) self.lockfull = kwargs.pop("lockfull", 0) else: self.rename = kwargs.pop("rename", P_RENAME) self.remove = kwargs.pop("remove", P_REMOVE) self.trunc = kwargs.pop("trunc", P_TRUNC) self.ftrunc = kwargs.pop("ftrunc", P_FTRUNC) self.link = kwargs.pop("link", P_LINK) self.slink = kwargs.pop("slink", P_SLINK) self.readdir = kwargs.pop("readdir", P_READDIR) self.lock = kwargs.pop("lock", P_LOCK) self.unlock = kwargs.pop("unlock", P_UNLOCK) self.tlock = kwargs.pop("tlock", P_TLOCK) self.lockfull = kwargs.pop("lockfull", P_LOCKFULL) # Get size multiplier self.sizemult = convert_str(kwargs.pop("sizemult", P_SIZEMULT)) # Convert sizes and apply multiplier self.fsizeavg = int(self.sizemult * convert_str(kwargs.pop("fsizeavg", P_FILESIZE))) self.fsizedev = int(self.sizemult * convert_str(kwargs.pop("fsizedev", P_FSIZEDEV))) self.rsize = int(self.sizemult * convert_str(kwargs.pop("rsize", P_RSIZE))) self.wsize = int(self.sizemult * convert_str(kwargs.pop("wsize", P_WSIZE))) self.rsizedev = int(self.sizemult * convert_str(kwargs.pop("rsizedev", P_RSIZEDEV))) self.wsizedev = int(self.sizemult * convert_str(kwargs.pop("wsizedev", P_WSIZEDEV))) if self.direct: # When using direct I/O, use fixed read/write block sizes self.rsizedev = 0 self.wsizedev = 0 # Initialize counters self.rbytes = 0 self.wbytes = 0 self.nopen = 0 self.nopendgr = 0 self.nosync = 0 self.nclose = 0 self.nread = 0 self.nwrite = 0 self.nfsync = 0 self.nrename = 0 self.nremove = 0 self.ntrunc = 0 self.nftrunc = 0 self.nlink = 0 self.nslink = 0 self.nreaddir = 0 self.nlock = 0 self.nunlock = 0 self.ntlock = 0 self.stime = 0 # Set read and write option percentages total = 100 if self.rdwr is None: if self.read is None and self.write is None: # All read and write options are not given, use defaults self.read = P_READ self.write = P_WRITE self.rdwr = P_RDWR elif self.read is None or self.write is None: # If only read or write is given, don't use rdwr self.rdwr = 0 else: # If both read and write are given, set rdwr to add up to 100 self.rdwr = max(0, total - self.read - self.write) else: # Option rdwr is given, calculate remainder left for read and write total -= self.rdwr if self.read is None and self.write is None: # Only rdwr is given, distribute remainder equally # between read and write self.read = int(total/2) self.write = total - self.read elif self.read is None and self.write is not None: # Option rdwr and write are given, set read percentage self.read = total - self.write elif self.read is not None and self.write is None: # Option rdwr and read are given, set write percentage self.write = total - self.read # Verify read and write options add up to 100 percent total = abs(self.read) + abs(self.write) + abs(self.rdwr) if total != 100: print "Total for read, write and rdwr 
must be == 100" sys.exit(2) # Set verbose level mask self.debug_level(self.verbose) # Set timestamp format to include the date and time self.tstamp(fmt="{0:date:%Y-%m-%d %H:%M:%S.%q} ") self.logbase = None if self.createlog or self.createlogs: # Create main log file datetimestr = self.timestamp("{0:date:%Y%m%d%H%M%S_%q}") logname = "%s_%s" % (self.progname, datetimestr) self.logbase = os.path.join(self.logdir, logname) self.logfile = self.logbase + ".log" self.open_log(self.logfile) # Multiprocessing self.tid = 0 self.queue = None # Memory buffers self.fbuffers = [] self.PAGESIZE = os.sysconf(os.sysconf_names['SC_PAGESIZE']) # Load share library for calling C library functions try: # Linux self.libc = ctypes.CDLL('libc.so.6', use_errno=True) except: # MacOS self.libc = ctypes.CDLL('libc.dylib', use_errno=True) self.libc.malloc.argtypes = [ctypes.c_long] self.libc.malloc.restype = ctypes.c_void_p self.libc.posix_memalign.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_long, ctypes.c_long] self.libc.posix_memalign.restype = ctypes.c_int self.libc.read.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_long] self.libc.read.restype = ctypes.c_int self.libc.write.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_long] self.libc.write.restype = ctypes.c_int self.libc.lseek.argtypes = [ctypes.c_int, ctypes.c_long, ctypes.c_int] self.libc.lseek.restype = ctypes.c_long self.libc.memcpy.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_long] self.libc.memcpy.restype = ctypes.c_void_p def __del__(self): """Destructor""" if getattr(self, 'logfile', None): print "\nLogfile: %s" % self.logfile def _dprint(self, level, msg): """Local dprint function, if called from a subprocess send the message to the main process, otherwise use dprint on message """ if self.queue and not self.createlogs: # Send message to main process self.queue.put([level,msg]) else: # Display message and send it to the log file self.dprint(level, msg) def _get_tree(self): """Read top level directory for existing files to populate database This is used so it can be run in the same top level directory multiple times """ for entry in os.listdir(self.datadir): # Must match file names given by _newname if not re.search(r'^f[\dA-F]+$', entry): continue # Get tid from file name tid = int(entry[1:self.bidx], 16) if self.tid != tid: continue # Get index from file name and set it index = int(entry[self.bidx:], 16) if self.n_index <= index: self.n_index = index + 1 # Get file size and append it to database absfile = os.path.join(self.datadir, entry) try: fst = os.stat(absfile) size = fst.st_size except: size = 0 fileobj = FileObj(name=entry, size=size) fileobj.debug_repr(1) if os.path.islink(absfile): fileobj.srcname = os.path.basename(os.readlink(absfile)) self.n_files.append(fileobj) def _newname(self): """Create new file name""" name = "%s%06X" % (self.basename, self.n_index) self.n_index += 1 return name def _percent(self, pvalue): """Test percent value""" if pvalue >= 100: return True elif pvalue <= 0: return False return self.random.randint(0,99) < pvalue def _get_fileobj(self): """Get a random file object""" # Number of files available nlen = len(self.n_files) self.findex = self.random.randint(0, nlen-1) return self.n_files[self.findex] def _getiolist(self, size, iswrite): """Return list of I/O blocks to read/write""" iolist = [] if iswrite: bsize = self.wsize bdev = self.wsizedev else: bsize = self.rsize bdev = self.rsizedev tsize = 0 offset = 0 while tsize < size: block = {} if self.direct: # Direct I/O uses same block 
size for all blocks blocksize = bsize else: # Buffered I/O uses different block sizes blocksize = int(abs(self.random.gauss(bsize, bdev))) if tsize + blocksize > size: # Use remaining bytes for last block blocksize = size - tsize iolist.append({'offset':offset, 'write':iswrite, 'size':blocksize}) offset += blocksize tsize += blocksize return iolist def _mem_alloc(self, size, aligned=False): """Allocate memory for use in C library functions""" dbuffer = None if aligned: # Allocate aligned buffer dbuffer = ctypes.c_void_p() self.libc.posix_memalign(ctypes.byref(dbuffer), self.PAGESIZE, size) else: # Allocate regular buffer dbuffer = self.libc.malloc(size) # Add allocated buffer so it can be freed self.fbuffers.append(dbuffer) return dbuffer def _getlock(self, name, fd, lock_type=None, offset=0, length=0, lock=None, tlock=False): """Get byte range lock on file given by file descriptor""" n = self.random.randint(0,99) stype = fcntl.F_SETLK if lock_type == fcntl.F_UNLCK: lstr = "UNLOCK" if not lock or n >= self.unlock: # Do not unlock file return self.nunlock += 1 else: if tlock: # Just do TLOCK lstr = "TLOCK " stype = fcntl.F_GETLK if n >= self.tlock: # No lock, so no tlock return self.ntlock += 1 else: lstr = "LOCK " if n >= self.lock: # No lock return self.nlock += 1 if lock_type is None: # Choose lock: read or write if self._percent(50): lock_type = fcntl.F_RDLCK else: lock_type = fcntl.F_WRLCK if not tlock: # LOCK is requested, but do TLOCK before actual lock self._getlock(name, fd, lock_type=lock_type, offset=offset, length=length, lock=lock, tlock=True) fstr = "" if offset == 0 and length == 0 and lstr == "LOCK ": fstr = " full file" self._dprint("DBG4", "%s %s %d @ %d (%s)%s" % (lstr, name, length, offset, LOCKMAP[lock_type], fstr)) lockdata = struct.pack('hhllhh', lock_type, 0, offset, length, 0, 0) return fcntl.fcntl(fd, stype, lockdata) def _do_io(self, **kwargs): """Read or write to the given file descriptor""" fd = kwargs.pop("fd", None) write = kwargs.pop("write", False) offset = kwargs.pop("offset", 0) size = kwargs.pop("size", 0) fileobj = kwargs.pop("fileobj", None) lockfull = kwargs.pop("lockfull", True) lockout = None if self.iodelay > 0.0: time.sleep(self.iodelay) # Set file offset to read/write os.lseek(fd, offset, os.SEEK_SET) if write: if self.random and not lockfull: # Lock file segment lockout = self._getlock(fileobj.name, fd, lock_type=fcntl.F_WRLCK, offset=offset, length=size) data = 'x' * size self._dprint("DBG5", "WRITE %s %d @ %d" % (fileobj.name, size, offset)) if self.direct: # Direct I/O -- use native write function count = self.libc.write(fd, self.wbuffer, size) else: # Buffered I/O count = os.write(fd, data) if self._percent(self.fsync): self._dprint("DBG4", "FSYNC %s" % fileobj.name) self.nfsync += 1 os.fsync(fd) self.nwrite += 1 self.wbytes += count fsize = offset + count if fileobj.size < fsize: fileobj.size = fsize else: if self.random and not lockfull: # Lock file segment lockout = self._getlock(fileobj.name, fd, lock_type=fcntl.F_RDLCK, offset=offset, length=size) self._dprint("DBG5", "READ %s %d @ %d" % (fileobj.name, size, offset)) if self.direct: # Direct I/O -- use native read function count = self.libc.read(fd, self.rbuffer, size) else: # Buffered I/O data = os.read(fd, size) count = len(data) self.rbytes += count self.nread += 1 if self.random and not lockfull: # Unlock file segment self._getlock(fileobj.name, fd, lock_type=fcntl.F_UNLCK, offset=offset, length=size, lock=lockout) return count def _do_file(self): """Operate on a file, create, read, 
truncate, etc.""" self.absfile = "" # Number of files available nlen = len(self.n_files) if self.up_minfiles == 0 and nlen > self.minfiles: self.minfiles = self.bot_minfiles self.up_minfiles = 1 if self.up_minfiles > 0 and nlen < self.minfiles: self.minfiles = self.top_minfiles self.up_minfiles = 0 if nlen > self.minfiles and self._percent(self.trunc): # Truncate file using the file name fileobj = self._get_fileobj() self.absfile = os.path.join(self.datadir, fileobj.name) # Choose new size at random nsize = self.random.randint(0, fileobj.size + self.wsizedev) self._dprint("DBG2", "TRUNC %s %d -> %d" % (fileobj.name, fileobj.size, nsize)) out = self.libc.truncate(self.absfile, nsize) if out == -1: err = ctypes.get_errno() if hasattr(fileobj, 'srcname') and err == errno.ENOENT: # Make sure not to fail if it is a broken symbolic link self._dprint("DBG2", "TRUNC %s: broken symbolic link" % fileobj.name) return raise OSError(err, os.strerror(err), fileobj.name) else: self.ntrunc += 1 fileobj.size = nsize return if nlen > self.minfiles and self._percent(self.rename): # Rename file fileobj = self._get_fileobj() name = self._newname() self.absfile = os.path.join(self.datadir, fileobj.name) newfile = os.path.join(self.datadir, name) self._dprint("DBG2", "RENAME %s -> %s" % (fileobj.name, name)) os.rename(self.absfile, newfile) self.nrename += 1 fileobj.name = name return if nlen > self.minfiles and self._percent(self.remove): # Remove file fileobj = self._get_fileobj() self.absfile = os.path.join(self.datadir, fileobj.name) self._dprint("DBG2", "REMOVE %s" % fileobj.name) os.unlink(self.absfile) self.nremove += 1 self.n_files.pop(self.findex) return if nlen > self.minfiles and self._percent(self.link): # Create hard link name = self._newname() self.absfile = os.path.join(self.datadir, name) index = 0 while True: index += 1 fileobj = self._get_fileobj() if not hasattr(fileobj, 'srcname'): # This file is not a symbolic link, use it break if index >= 10: self.absfile = os.path.join(self.datadir, fileobj.name) raise Exception("Unable to find a valid source file for hard link") srcfile = os.path.join(self.datadir, fileobj.name) self._dprint("DBG2", "LINK %s -> %s" % (name, fileobj.name)) os.link(srcfile, self.absfile) self.nlink += 1 linkobj = FileObj(name=name, size=fileobj.size) self.n_files.append(linkobj) return if nlen > self.minfiles and self._percent(self.slink): # Create symbolic link name = self._newname() self.absfile = os.path.join(self.datadir, name) index = 0 while True: index += 1 fileobj = self._get_fileobj() if not hasattr(fileobj, 'srcname'): # This file is not a symbolic link, use it break if index >= 10: self.absfile = os.path.join(self.datadir, fileobj.name) raise Exception("Unable to find a valid source file for symbolic link") self._dprint("DBG2", "SLINK %s -> %s" % (name, fileobj.name)) os.symlink(fileobj.name, self.absfile) self.nslink += 1 slinkobj = FileObj(name=name, size=fileobj.size, srcname=fileobj.name) self.n_files.append(slinkobj) return if nlen > self.minfiles and self._percent(self.readdir): # Read directory count = self.random.randint(1,99) self._dprint("DBG2", "READDIR %s maxentries: %d" % (self.datadir, count)) self.absfile = self.datadir fd = self.libc.opendir(self.datadir) index = 0 while True: dirent = self.libc.readdir(fd) if dirent == 0 or index >= count: break index += 1 out = self.libc.closedir(fd) self.nreaddir += 1 return # Select type of open: read, write or rdwr total = self.read + self.write rn = self.random.randint(0,99) if rn < self.read: oflags = 
os.O_RDONLY oflist = ["O_RDONLY"] elif rn < total: oflags = os.O_WRONLY oflist = ["O_WRONLY"] else: oflags = os.O_RDWR oflist = ["O_RDWR"] # Set create file flag if nlen < self.minfiles: # Create at least self.minfiles before any other operation cflag = True else: cflag = self._percent(self.create) if cflag: # Create new name name = self._newname() fileobj = FileObj(name=name, size=0) self.n_files.append(fileobj) if oflags == os.O_RDONLY: # Creating file, must be able to write oflags = os.O_WRONLY oflist = ["O_WRONLY"] oflags |= os.O_CREAT oflist.append("O_CREAT") else: # Use name chosen at random fileobj = self._get_fileobj() if "O_RDONLY" not in oflist and self._percent(self.osync): # Add O_SYNC flag when opening file for writing oflags |= os.O_SYNC oflist.append("O_SYNC") self.nosync += 1 if self.direct: # Open file for direct I/O oflags |= os.O_DIRECT oflist.append("O_DIRECT") # Select random or sequential I/O sstr = "sequen" if self._percent(self.randio): sstr = "random" ostr = "|".join(oflist) fd = None index = 0 is_symlink = False while fd is None: try: index += 1 if hasattr(fileobj, 'srcname'): is_symlink = True self.absfile = os.path.join(self.datadir, fileobj.name) self._dprint("DBG2", "OPEN %s %s %s" % (fileobj.name, sstr, ostr)) fd = os.open(self.absfile, oflags) st = os.fstat(fd) if is_symlink: self._dprint("DBG6", "OPEN %s inode:%d symlink" % (fileobj.name, st.st_ino)) absfile = os.path.join(self.datadir, fileobj.srcname) st = os.stat(absfile) self._dprint("DBG6", "OPEN %s inode:%d src:%s" % (fileobj.name, st.st_ino, fileobj.srcname)) else: self._dprint("DBG6", "OPEN %s inode:%d" % (fileobj.name, st.st_ino)) except OSError as openerr: if is_symlink and openerr.errno == errno.ENOENT: self._dprint("DBG2", "OPEN %s: broken symbolic link" % fileobj.name) if index >= 10: # Do not exit execution, just return to select another operation return # Choose a new name at random fileobj = self._get_fileobj() is_symlink = False else: # Unknown error raise self.nopen += 1 # Get file size for writing size = int(abs(self.random.gauss(self.fsizeavg, self.fsizedev))) odgrade = False if oflags & os.O_WRONLY == os.O_WRONLY: lock_type = fcntl.F_WRLCK iolist = self._getiolist(size, True) elif oflags & os.O_RDWR == os.O_RDWR: lock_type = None iolist = self._getiolist(size, True) iolist += self._getiolist(size, False) if self._percent(self.odgrade): odgrade = True else: lock_type = fcntl.F_RDLCK size = fileobj.size if size == 0: # File does not have any data, at least try to read one block size = self.rsize iolist = self._getiolist(size, False) if sstr == "random": # Shuffle I/O list for random access self.random.shuffle(iolist) # Lock full file if necessary lockfull = False if self._percent(self.lockfull): lockfull = True lockfout = self._getlock(fileobj.name, fd, lock_type=lock_type, offset=0, length=0) if nlen > self.minfiles and "O_RDONLY" not in oflist and self._percent(self.ftrunc): # Truncate file using the file descriptor # Choose new size at random nsize = self.random.randint(0, fileobj.size + self.wsizedev) self._dprint("DBG2", "FTRUNC %s %d -> %d" % (fileobj.name, fileobj.size, nsize)) os.ftruncate(fd, nsize) self.nftrunc += 1 fileobj.size = nsize # Read or write the file for item in iolist: if self.runtime > 0 and time.time() >= self.s_time + self.runtime: # Runtime has been reached break self._do_io(**dict(fd=fd, fileobj=fileobj, lockfull=lockfull, **item)) if lockfull: # Unlock full file self._getlock(fileobj.name, fd, lock_type=fcntl.F_UNLCK, offset=0, length=0, lock=lockfout) fdr = 
None fdroffset = 0 if odgrade: # Need for open downgrade: # First, the file has been opened for read and write # Second, open file again for reading # Then close read and write file descriptor self._dprint("DBG2", "OPENDGR %s" % fileobj.name) fdr = os.open(self.absfile, os.O_RDONLY) self.nopendgr += 1 count = self._do_io(fd=fdr, offset=fdroffset, size=self.rsize, fileobj=fileobj) fdroffset += count # Close main file descriptor self._dprint("DBG3", "CLOSE %s" % fileobj.name) os.close(fd) self.nclose += 1 if odgrade: for i in xrange(10): count = self._do_io(fd=fdr, offset=fdroffset, size=self.rsize, fileobj=fileobj) fdroffset += count self._dprint("DBG3", "CLOSE %s" % fileobj.name) os.close(fdr) self.nclose += 1 return def run_process(self, tid=0): """Main loop for each process""" ret = 0 stime = time.time() self.tid = tid self.n_index = 1 self.n_files = [] self.s_time = stime # Setup signal handler to gracefully terminate process signal.signal(signal.SIGTERM, stop_handler) # Set file base name according to the number processes self.bidx = 1 + max(2, len("{0:x}".format(max(0,self.nprocs-1)))) self.basename = "f{0:0{width}X}".format(self.tid, width=self.bidx-1) if self.createlogs: # Open a log file for each process if self.nprocs <= 10: self.logfile = self.logbase + "_%d.log" % self.tid elif self.nprocs <= 100: self.logfile = self.logbase + "_%02d.log" % self.tid elif self.nprocs <= 1000: self.logfile = self.logbase + "_%03d.log" % self.tid else: self.logfile = self.logbase + "_%04d.log" % self.tid self.open_log(self.logfile) # Read top level directory and populate file database when # a previous instance was ran on the same top level directory self._get_tree() # Create random object and initialized seed for process self.random = Random() self.random.seed(self.seed + tid) if self.direct: # Round up to nearest PAGESIZE boundary rsize = self.rsize + (self.PAGESIZE - self.rsize)%self.PAGESIZE wsize = self.wsize + (self.PAGESIZE - self.wsize)%self.PAGESIZE self._dprint("DBG7", "Allocating aligned read buffer of size %d" % rsize) self.rbuffer = self._mem_alloc(rsize, aligned=True) self._dprint("DBG7", "Allocating aligned write buffer of size %d" % wsize) self.wbuffer = self._mem_alloc(wsize, aligned=True) pdata = ctypes.create_string_buffer('x' * wsize) self.libc.memcpy(self.wbuffer, pdata, wsize); count = 0 while True: try: self._do_file() except TermSignal: # SIGTERM has been raised, so stop running and send stats break except Exception: errstr = "ERROR on file object %s (process #%d)\n" % (self.absfile, self.tid) errstr += "Directory i-node: %d\n" % self.datadir_st.st_ino ioerror = traceback.format_exc() self._dprint("INFO", errstr+ioerror) ret = 1 break ctime = time.time() if self.runtime > 0 and ctime >= stime + self.runtime: # Runtime has been reached break count += 1 if self.queue: # Send all counts to main process self.queue.put(["RBYTES", self.rbytes]) self.queue.put(["WBYTES", self.wbytes]) self.queue.put(["NOPEN", self.nopen]) self.queue.put(["NOPENDGR", self.nopendgr]) self.queue.put(["NOSYNC", self.nosync]) self.queue.put(["NCLOSE", self.nclose]) self.queue.put(["NREAD", self.nread]) self.queue.put(["NWRITE", self.nwrite]) self.queue.put(["NFSYNC", self.nfsync]) self.queue.put(["NRENAME", self.nrename]) self.queue.put(["NREMOVE", self.nremove]) self.queue.put(["NTRUNC", self.ntrunc]) self.queue.put(["NFTRUNC", self.nftrunc]) self.queue.put(["NLINK", self.nlink]) self.queue.put(["NSLINK", self.nslink]) self.queue.put(["NREADDIR", self.nreaddir]) self.queue.put(["NLOCK", self.nlock]) 
self.queue.put(["NTLOCK", self.ntlock]) self.queue.put(["NUNLOCK", self.nunlock]) self.queue.put(["RETVALUE", ret]) if self.direct: self._dprint("DBG7", "Free data buffers") for dbuffer in self.fbuffers: self.libc.free(dbuffer) self.close_log() return ret def run(self): """Main function where all processes are started""" errors = 0 if self.seed is None: # Create random seed self.seed = int(1000.0*time.time()) # Main seed so run can be reproduced self.dprint("INFO", "SEED = %d" % self.seed) # Flush log file descriptor to make sure above info is not written # to all log files when using multiple logs for each subprocess self.flush_log() stime = time.time() if not os.path.exists(self.datadir): # Create top level directory if it does not exist os.mkdir(self.datadir, 0777) self.datadir_st = os.stat(self.datadir) if self.nprocs > 1: # setup interprocess queue self.queue = JoinableQueue() processes = [] for i in xrange(self.nprocs): # Run each subprocess with its own process id (tid) # The process id is used to set the random number generator # and also to have each process work with different files process = Process(target=self.run_process, kwargs={'tid':self.tid}) processes.append(process) process.start() self.tid += 1 done = False while not done: # Wait for a short time so main process does not hog the CPU # by checking the queue continuously time.sleep(0.1) while not self.queue.empty(): # Get any pending messages from any of the processes level, msg = self.queue.get() # Check if message is a valid count first if level == "RBYTES": self.rbytes += msg elif level == "WBYTES": self.wbytes += msg elif level == "NOPEN": self.nopen += msg elif level == "NOPENDGR": self.nopendgr += msg elif level == "NOSYNC": self.nosync += msg elif level == "NCLOSE": self.nclose += msg elif level == "NREAD": self.nread += msg elif level == "NWRITE": self.nwrite += msg elif level == "NFSYNC": self.nfsync += msg elif level == "NRENAME": self.nrename += msg elif level == "NREMOVE": self.nremove += msg elif level == "NTRUNC": self.ntrunc += msg elif level == "NFTRUNC": self.nftrunc += msg elif level == "NLINK": self.nlink += msg elif level == "NSLINK": self.nslink += msg elif level == "NREADDIR": self.nreaddir += msg elif level == "NLOCK": self.nlock += msg elif level == "NTLOCK": self.ntlock += msg elif level == "NUNLOCK": self.nunlock += msg elif level == "RETVALUE": if msg != 0: errors += 1 if self.exiterr: # Exit on first error for process in list(processes): process.terminate() break else: # Message is not any of the valid counts, # so treat it as a debug message self.dprint(level, msg) # Check if any process has finished for process in list(processes): if not process.is_alive(): process.join() if not self.exiterr and abs(process.exitcode): errors += 1 processes.remove(process) if len(processes) == 0: done = True break else: # Only one process to run, just run the function out = self.run_process(tid=self.tid) if out != 0: errors += 1 # Set seed to make sure if this function is called again a different # set of operations will be called self.seed += self.nprocs delta = time.time() - stime # Display stats self.dprint("INFO", "==================STATS===================") self.dprint("INFO", "OPEN: % 7d" % self.nopen) self.dprint("INFO", "OPENDGR: % 7d" % self.nopendgr) self.dprint("INFO", "CLOSE: % 7d" % self.nclose) self.dprint("INFO", "OSYNC: % 7d" % self.nosync) self.dprint("INFO", "READ: % 7d, % 10s, % 10s/s" % (self.nread, convert_uint(self.rbytes), convert_uint(self.rbytes/delta))) self.dprint("INFO", "WRITE: % 
7d, % 10s, % 10s/s" % (self.nwrite, convert_uint(self.wbytes), convert_uint(self.wbytes/delta))) self.dprint("INFO", "FSYNC: % 7d" % self.nfsync) self.dprint("INFO", "RENAME: % 7d" % self.nrename) self.dprint("INFO", "REMOVE: % 7d" % self.nremove) self.dprint("INFO", "TRUNC: % 7d" % self.ntrunc) self.dprint("INFO", "FTRUNC: % 7d" % self.nftrunc) self.dprint("INFO", "LINK: % 7d" % self.nlink) self.dprint("INFO", "SLINK: % 7d" % self.nslink) self.dprint("INFO", "READDIR: % 7d" % self.nreaddir) self.dprint("INFO", "LOCK: % 7d" % self.nlock) self.dprint("INFO", "TLOCK: % 7d" % self.ntlock) self.dprint("INFO", "UNLOCK: % 7d" % self.nunlock) if errors > 0: self.dprint("INFO", "ERRORS: % 7d" % errors) self.dprint("INFO", "TIME: % 7d secs" % delta)
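# Condensed, hedged sketch of FileIO's stats plumbing: each child pushes ("COUNTER",
# value) pairs plus a final ("RETVALUE", code) onto one shared queue; the parent polls
# the queue, folds counters into totals, and reaps children as they finish. The counter
# names here are placeholders for the much longer list sent by run_process().
import time
from multiprocessing import Process, JoinableQueue

def run_child(tid, queue):
    queue.put(["NREAD", 10 * (tid + 1)])
    queue.put(["NWRITE", 5])
    queue.put(["RETVALUE", 0])

if __name__ == "__main__":
    queue = JoinableQueue()
    procs = [Process(target=run_child, args=(tid, queue)) for tid in range(3)]
    for p in procs:
        p.start()
    totals, errors = {"NREAD": 0, "NWRITE": 0}, 0
    while procs:
        time.sleep(0.1)                      # don't hog the CPU while polling the queue
        while not queue.empty():
            level, msg = queue.get()
            if level in totals:
                totals[level] += msg
            elif level == "RETVALUE" and msg != 0:
                errors += 1
        for p in list(procs):                # reap any children that have finished
            if not p.is_alive():
                p.join()
                procs.remove(p)
    while not queue.empty():                 # catch messages that raced the last drain
        level, msg = queue.get()
        if level in totals:
            totals[level] += msg
    print(totals, "errors:", errors)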
tasks = [Task(q, out_queue) for i in range(NUM_WORKERS)] for w in tasks: w.start() logging.info("Items left in queue: {0}".format(q.qsize())) logging.debug("Joining q") # q.join() # qf.join() if False: processes_active = True while processes_active: for w in tasks: processes_active = False or w.is_alive() logging.debug(w.is_alive()) sleep(0.2) for y in tasks: y.join() logging.info("Elapsed time with {0} threads and {1} as maximum number: {2}".format(NUM_WORKERS, MAX_PRIME_NUMBER, datetime.now()-start_time)) count = 0 while not out_queue.empty(): out_queue.get() out_queue.task_done() count += 1 logging.info("Total primes found: {0}".format(count))
def update_data(args, logger): start_date = datetime.datetime.strptime(args.start_date, '%Y-%m-%d') end_date = datetime.datetime.strptime(args.end_date, '%Y-%m-%d') if start_date > end_date: logger.warning("start_date[%s] greater than end_date[%s]" % (args.start_date, args.end_date)) return True task_queue = JoinableQueue() result_queue = JoinableQueue() finished_date_list = [] cur_date = end_date cnt = 0 while cur_date >= start_date: cur_date_str = cur_date.strftime("%Y-%m-%d") task_queue.put(cur_date_str) cur_date = cur_date - datetime.timedelta(days=1) cnt += 1 logger.info("run task in [%s] days" % cnt) process_list = [] for i in range(args.parallel): process = Process(target=update_data_each_day, args=(task_queue, result_queue, i, logger)) process_list.append(process) for process in process_list: process.daemon = True process.start() logger.info("run task in main process") success_dates = [] failed_dates = [] while 1: if len(finished_date_list) >= cnt: logger.info("finish all task with finished_data_list_len[%s], cnt[%s]" % (len(finished_date_list), cnt)) break already_finish_flag = True if not result_queue.empty(): finished_date_info_str = result_queue.get() finished_date_info = json.loads(finished_date_info_str) finished_date = finished_date_info["date"] finished_data_status = finished_date_info["status"] logger.info("finished_date[%s] get from result_queue" % finished_date) if not int(finished_date_info["already_update"]): already_finish_flag = False if finished_data_status == "fail": logger.warning("finished_date[%s] generate_pb_data failed, already_update_flag[%s]" % ( finished_date, already_finish_flag)) failed_dates.append(finished_date) notice(finished_date, 0, "generate_pb_data_failed", already_finish_flag, logger) else: if finished_date in finished_date_list: logger.error("finished_date[%s] already in finished_date_list, already_update_flag[%s]" % ( finished_date, already_finish_flag)) failed_dates.append(finished_date) notice(finished_date, 0, "repeated_date_generation", already_finish_flag, logger) else: logger.info("finished_date[%s] normal case add to finished_date_list" % finished_date) if delete_data(finished_date, args.delay_num, logger) < 0: logger.error("finished_date[%s] delete old_data failed, already_update_flag[%s]" % ( finished_date, already_finish_flag)) failed_dates.append(finished_date) notice(finished_date, 0, "delete_old_data_failed", already_finish_flag, logger) else: logger.info("finished_date[%s] update_success, already_update_flag[%s]" % ( finished_date, already_finish_flag)) success_dates.append(finished_date) notice(finished_date, 1, "update_pb_success", already_finish_flag, logger) latest_success_delay_date = get_latest_success_delay_date(success_dates, args.delay_num, logger) if latest_success_delay_date != "-1": notice_latest(latest_success_delay_date, already_finish_flag, logger) finished_date_list.append(finished_date) with open("success_pb_dates", "w") as fp1: json.dump(success_dates, fp1) with open("failed_pb_dates", "w") as fp1: json.dump(failed_dates, fp1) time.sleep(2) logger.info("stop subprocess tasks") for _ in process_list: task_queue.put(None) task_queue.join() logger.info("run task in main process finish")
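# Small sketch of the shutdown used at the end of update_data(): the parent puts one
# None "poison pill" per worker on the task JoinableQueue, each worker acknowledges its
# pill with task_done() and exits its loop, and task_queue.join() then returns. The
# worker body is a trivial stand-in for update_data_each_day.
from multiprocessing import Process, JoinableQueue

def worker(task_queue, result_queue, worker_id):
    while True:
        date_str = task_queue.get()
        if date_str is None:                 # poison pill: acknowledge and leave
            task_queue.task_done()
            break
        result_queue.put((worker_id, date_str))
        task_queue.task_done()

if __name__ == "__main__":
    task_queue, result_queue = JoinableQueue(), JoinableQueue()
    for day in ("2024-01-03", "2024-01-02", "2024-01-01"):
        task_queue.put(day)
    procs = [Process(target=worker, args=(task_queue, result_queue, i), daemon=True)
             for i in range(2)]
    for p in procs:
        p.start()
    for _ in procs:                          # one pill per worker
        task_queue.put(None)
    task_queue.join()                        # every date and every pill acknowledged
    while not result_queue.empty():
        print(result_queue.get())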
histogram_merge_worker.start() if args.top: reader_procs = [ psutil.Process(reader.pid) for reader in readers ] worker_procs = [ psutil.Process(worker.pid) for worker in workers ] pair_buffer={} scaffold_count={} # while (not inq.empty()) or sum( [reader.is_alive() for reader in readers] )>0: while True: if args.debug: print("get") try: procid,scaffold,pairs = inq.get() # procid,scaffold,pairs = inq.get(True,10) #print("#got data:",procid,scaffold,len(pairs)) print("#got data from inq:",procid,scaffold,len(pairs),inq.empty(),inq.qsize(),inq.full(),strftime("%Y-%m-%d %H:%M:%S"),sum( [reader.is_alive() for reader in readers] ),"q.size():",q.qsize(),file=sys.stderr,sep="\t") sys.stderr.flush() sys.stdout.flush() except Exception as e: print(e,file=sys.stderr) if args.top: print("queue get timed out",[reader.cpu_percent() for reader in reader_procs],[worker.cpu_percent() for worker in worker_procs]) #print("#timed out",inq.empty()) print("#read from queue timed out:",inq.empty(),inq.qsize(),inq.full(),strftime("%Y-%m-%d %H:%M:%S"),sum( [reader.is_alive() for reader in readers] ),file=sys.stderr,sep="\t") sys.stderr.flush() continue if args.debug: print("got") if not scaffold in pair_buffer: pair_buffer[scaffold]=[] pair_buffer[scaffold] += pairs scaffold_count[scaffold] = scaffold_count.get(scaffold,0)+1
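# Hedged sketch of the merge loop above: several reader processes push
# (procid, key, items) tuples onto one queue and the parent accumulates them into a
# per-key buffer, here for a known number of messages so no empty()/is_alive() polling
# is needed. reader and the keys are illustrative; the original buffers read pairs per
# scaffold and also reports queue health to stderr.
from multiprocessing import Process, Queue

def reader(procid, inq):
    for key in ("scaffold_1", "scaffold_2"):
        inq.put((procid, key, [procid] * 3))   # pretend these are read pairs

if __name__ == "__main__":
    inq = Queue()
    readers = [Process(target=reader, args=(i, inq)) for i in range(2)]
    for r in readers:
        r.start()
    pair_buffer = {}
    scaffold_count = {}
    expected = len(readers) * 2                # each reader sends two scaffolds
    for _ in range(expected):
        procid, scaffold, pairs = inq.get()
        pair_buffer.setdefault(scaffold, []).extend(pairs)
        scaffold_count[scaffold] = scaffold_count.get(scaffold, 0) + 1
    for r in readers:
        r.join()
    print({k: len(v) for k, v in pair_buffer.items()}, scaffold_count)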