from multiprocessing import JoinableQueue, Lock, Process, Semaphore, cpu_count


class Thread_Pool_Manager(object):
    def __init__(self, thread_num=cpu_count()):
        self.thread_num = thread_num
        print(thread_num)
        self.work_queue = JoinableQueue()
        self.work_num = Semaphore(0)
        self.mutex = Lock()

    def start_threads(self):
        for i in range(self.thread_num):
            thread = Process(target=self.do_job)
            thread.daemon = True  # set worker process as daemon
            thread.start()

    def do_job(self):
        while True:
            self.work_num.acquire()
            with self.mutex:
                print(1, self.work_queue.qsize())
                thread_job = self.work_queue.get()
                print(0, self.work_queue.qsize())
            thread_job.do_job(self.work_queue, self.work_num)
            print(self.work_queue.qsize())
            self.work_queue.task_done()

    def join(self):
        self.work_queue.join()

    def add_job(self, job):
        self.work_queue.put(job)
        self.work_num.release()
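# Hypothetical usage sketch for Thread_Pool_Manager above. The job class and its
# do_job(queue, semaphore) signature are assumptions inferred from the worker loop,
# which calls thread_job.do_job(self.work_queue, self.work_num); nothing below comes
# from the original codebase.
class SquareJob(object):
    def __init__(self, value):
        self.value = value

    def do_job(self, work_queue, work_num):
        print(self.value * self.value)


if __name__ == "__main__":
    manager = Thread_Pool_Manager(thread_num=4)
    manager.start_threads()
    for v in range(10):
        manager.add_job(SquareJob(v))
    manager.join()  # blocks until every queued job has been task_done()'d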
def next_train_parallel(self, img_len, n_workers):
    # deprecated
    # create multiprocessing overhead
    from multiprocessing import JoinableQueue, Process

    q_send, q_recive = JoinableQueue(), JoinableQueue()
    # send some jobs to the queue
    workers = [
        Process(target=get_batch_parallel,
                args=(q_send, q_recive, self.n_channels, self.img_w, self.img_h,
                      img_len, self.train_imgs, self.train_real_text,
                      self.test_imgs, self.test_real_text, self.is_nrc,
                      self.absolute_max_string_len,))
        for n in range(n_workers)
    ]
    # start all processes
    [i.start() for i in workers]

    while True:
        # check whether we need to send data
        if q_recive.empty() or q_send.qsize() < n_workers:
            for _ in range(n_workers - q_send.qsize()):
                q_send.put([self.cur_test_index, self.minibatch_size, False])
                if self.cur_test_index + self.minibatch_size > len(self.test_imgs):
                    self.cur_test_index = 0
                    # reshuffle is done in place
                    reshuffle_index = np.arange(len(self.test_imgs))
                    np.random.shuffle(reshuffle_index)
                    self.test_imgs = self.test_imgs[reshuffle_index].ravel()
                    self.test_real_text = self.test_real_text[reshuffle_index].ravel()
        ret = q_recive.get()
        yield ret
def load_urls(self, **kwargs):
    urls = ["https://www.atagar.com/echo.php"] * 100
    urls_queue = Queue()
    for url in urls:
        filename = os.path.join(self.results_directory,
                                "doc_%03d.txt" % (urls_queue.qsize() + 1))
        urls_queue.put((url, filename))
    self.urls_queue = urls_queue
    self.num_urls = int(urls_queue.qsize())
import sys
from math import ceil
from multiprocessing import JoinableQueue, Process, Queue


def queueManager(numProc, myList, function, *args):
    '''queueManager(numProc, myList, function, *args):
    generic function used to start worker processes via the multiprocessing Queue object
    numProc - number of processors to use
    myList - a list of objects to be iterated over
    function - target function
    *args - additional arguments to pass to function

    Return - an unordered list of the results from myList
    '''
    qIn = Queue()
    qOut = JoinableQueue()
    if args:
        arguments = (qIn, qOut,) + args
    else:
        arguments = (qIn, qOut,)
    results = []

    # reduce processor count if proc count > files
    i = 0
    for l in myList:
        qIn.put((i, l))
        i += 1

    for _ in range(numProc):
        p = Process(target=function, args=arguments)
        p.start()

    sys.stdout.write("Progress: {:>3}%".format(0))
    curProgress = 0
    lastProgress = 0
    while qOut.qsize() < len(myList):
        # sys.stdout.write("\b\b\b\b{:>3}%".format(int(ceil(100 * qOut.qsize() / len(myList)))))
        curProgress = int(ceil(100 * qOut.qsize() / len(myList)))
        if curProgress - lastProgress > 10:
            lastProgress += 10
            sys.stdout.write("\nProgress: {:>3}%".format(lastProgress))
            sys.stdout.flush()
    sys.stdout.write("\nProgress: {:>3}%".format(100))
    # sys.stdout.write("\b\b\b\b{:>3}%".format(100))
    sys.stdout.write("\n")

    for _ in range(len(myList)):
        # indicate done results processing
        results.append(qOut.get())
        qOut.task_done()

    # tell child processes to stop
    for _ in range(numProc):
        qIn.put('STOP')

    orderedRes = [None] * len(results)
    for i, res in results:
        orderedRes[i] = res

    qOut.join()
    qIn.close()
    qOut.close()
    return orderedRes
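# Hypothetical worker compatible with queueManager above; it is not part of the original
# source. It reads (index, item) tuples from qIn until it sees the 'STOP' sentinel and puts
# (index, result) tuples on qOut, which is the shape the re-ordering loop
# `for i, res in results` expects.
def squareWorker(qIn, qOut):
    for i, item in iter(qIn.get, 'STOP'):
        qOut.put((i, item * item))


# ordered = queueManager(4, [1, 2, 3, 4], squareWorker)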
def curate(db):
    art_queue = JoinableQueue()
    feed_queue = JoinableQueue()

    for feedurl in abo_urls(db["subscriptions"]):
        feed_queue.put(feedurl)

    print "Downloading %i feeds.." % feed_queue.qsize()
    run_processes(__feed_worker, feed_queue, art_queue)

    print "Downloading %i articles.." % art_queue.qsize()
    run_processes(__art_worker, art_queue)

    print "Done."
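# run_processes and the worker callables are not included in the snippet above; a hypothetical
# sketch of run_processes follows, inferred from its two call sites: it spawns a handful of
# processes that each run target(*queues) and waits for the first queue to be fully processed
# before returning. The process count default is an assumption.
from multiprocessing import Process


def run_processes(target, *queues, **kwargs):
    n_procs = kwargs.get("n_procs", 4)  # assumed default, not from the original
    procs = [Process(target=target, args=queues) for _ in range(n_procs)]
    for p in procs:
        p.daemon = True
        p.start()
    queues[0].join()  # JoinableQueue: wait until every item has been task_done()'d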
def test_basic():
    in_queue = JoinableQueue()
    algolia_reader = Algoliaio("MyAppID", "MyKey", 1000)
    algolia_reader.scan_and_queue(in_queue, p_index="INT_Rubriques", p_query=None,
                                  p_connect_timeout=30, p_read_timeout=60)
    assert in_queue.qsize() > 2600
def test_basic():
    in_queue = JoinableQueue()
    mongo_reader = Mongoio(p_host='localhost', p_port='27017',
                           p_user='******', p_password='******',
                           p_base='ACTIVITE',
                           p_rs_xtra_nodes=['localhost:27018', 'localhost:27019'],
                           p_rs_name='rs0')
    mongo_reader.scan_and_queue(in_queue, p_collection='rubriques', p_query={})
    assert in_queue.qsize() > 2600
class TaskManager:
    # noinspection PyPep8Naming
    def __init__(self, jobs_queue_capacity: int, workers_num: int,
                 WorkerClass: Worker.__class__ = Worker):
        # empty job queue
        self._queue = JoinableQueue(maxsize=jobs_queue_capacity)
        logger.info(
            f'Queue size set to accept at most {jobs_queue_capacity} jobs before pausing job assignment.'
        )
        self.WorkerClass = WorkerClass
        self.workers_num = max_number_of_workers(workers_num)
        self._workers = []

    def wake_up_workers(self):
        self._workers: List[Worker] = [
            self.WorkerClass(self._queue) for _ in range(self.workers_num)
        ]
        for worker in self._workers:
            worker.start()

    def assign_task(self, job: Task):
        self._queue.put(job)

    def stop_workers(self):
        logger.info('waiting for all workers to finish')
        # The usual termination condition is to put None on the queue. Queues are FIFO, but from the
        # Python 3.8 docs (https://docs.python.org/3.8/library/multiprocessing.html#pipes-and-queues):
        # "If multiple processes are enqueuing objects, it is possible for the objects to be received
        # at the other end out-of-order. However, objects enqueued by the same process will always be
        # in the expected order with respect to each other." So with a single producer this is not an
        # issue; with many producers it may happen that, even if the Nones are enqueued at the end of
        # the queue, consumers pick them up before other items (breaking the FIFO assumption), and the
        # workers would leave before the queue is empty. To avoid this, before sending the Nones it is
        # better to wait for the queue to be consumed.
        while not self._queue.empty():
            # not bullet-proof, as empty() and qsize() return approximate values, but it helps
            print(f"jobs waiting to be assigned: {self._queue.qsize()}")
            sleep(1)
        for _ in self._workers:
            self._queue.put(None, block=True, timeout=None)
        self._queue.join()
        logger.info('all processes finished')

    def discard_waiting_tasks(self):
        while not self._queue.empty():
            try:
                self._queue.get(False)
            except Empty:
                continue
            self._queue.task_done()

    def number_of_waiting_tasks(self):
        return self._queue.qsize()
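# Hypothetical sketch of a worker compatible with TaskManager above; the real Worker and Task
# classes are not shown in this snippet, so this stand-in only illustrates the protocol the
# manager relies on: consume jobs until the None sentinel arrives, and call task_done() for
# every get() (sentinel included) so that stop_workers()'s _queue.join() can return.
from multiprocessing import JoinableQueue, Process


class SketchWorker(Process):
    def __init__(self, queue: JoinableQueue):
        super().__init__(daemon=True)
        self._queue = queue

    def run(self):
        while True:
            job = self._queue.get()
            if job is None:
                self._queue.task_done()
                break
            try:
                job.execute()  # assumed Task interface, purely illustrative
            finally:
                self._queue.task_done()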
def apply_mt(self, xs, parallelism, **kwargs):
    """Run the UDF multi-threaded using python multiprocessing"""
    if snorkel_conn_string.startswith('sqlite'):
        raise ValueError('Multiprocessing with SQLite is not supported. Please use a different database backend,'
                         ' such as PostgreSQL.')

    # Fill a JoinableQueue with input objects
    in_queue = JoinableQueue()
    for x in xs:
        in_queue.put(x)

    # If the UDF has a reduce step, we collect the output of apply in a
    # Queue. This is also used to track progress via the UDF sentinel
    out_queue = JoinableQueue()

    # Keep track of progress counts
    total_count = in_queue.qsize()
    count = 0

    # Start UDF Processes
    for i in range(parallelism):
        udf = self.udf_class(in_queue=in_queue,
                             out_queue=out_queue,
                             add_to_session=(self.reducer is None),
                             **self.udf_init_kwargs)
        udf.apply_kwargs = kwargs
        self.udfs.append(udf)

    # Start the UDF processes, and then join on their completion
    for udf in self.udfs:
        udf.start()

    while any([udf.is_alive() for udf in self.udfs]) and count < total_count:
        y = out_queue.get()

        # Update progress whenever an item was processed
        if y == UDF.TASK_DONE_SENTINEL:
            count += 1
            if self.pb is not None:
                self.pb.update(1)
        # If there is a reduce step, do it now on this thread
        elif self.reducer is not None:
            self.reducer.reduce(y, **kwargs)
            out_queue.task_done()
        else:
            raise ValueError("Got non-sentinel output without reducer.")

    if self.reducer is None:
        for udf in self.udfs:
            udf.join()
    else:
        self.reducer.session.commit()
        self.reducer.session.close()

    # Flush the processes
    self.udfs = []
def start_multiprocessing(jsonfilename):
    work = JoinableQueue()
    STOP_TOKEN = "STOP!"

    filenames = []
    # start workers
    procs = []
    for i in range(int(cpu_count() - 8)):
        # slice off ".gz" and add a unique identifier
        outfile = "%s_%d.txt" % (jsonfilename[5:-3], i)
        filenames.append(outfile)
        # reset output file
        open(outfile, "w").close()
        t = Process(target=do_work, args=(work, outfile, STOP_TOKEN))
        t.daemon = True
        t.start()
        procs.append(t)

    # produce data
    with gzip.GzipFile(jsonfilename, "r") as fin:
        for line in fin:
            work.put(line)
            if work.qsize() > 300000:
                # try to avoid too much memory usage
                print("sleeping for 20 seconds to relax memory usage...")
                time.sleep(20)
                print("back to work! current queue size: %d" % (work.qsize()))

    work.put(STOP_TOKEN)

    print("Waiting to join processes...")
    for p in procs:
        p.join()

    print("collect output of multiple jobs into a single file??")
    with open("full_%s.out" % (jsonfilename[5:-3]), 'w') as outfile:
        for fname in filenames:
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)
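# do_work is not included in the snippet above; the following is a hypothetical sketch of a
# compatible consumer. It drains lines until it sees the stop token and, since only one token
# is enqueued for several workers, re-queues the token so the remaining workers can also exit
# (that hand-off detail is an assumption, not taken from the original code).
def do_work(work, outfile, stop_token):
    with open(outfile, "a") as out:
        while True:
            line = work.get()
            if line == stop_token:
                work.put(stop_token)  # pass the sentinel on to the next worker
                work.task_done()
                break
            if isinstance(line, bytes):  # gzip yields bytes on Python 3
                line = line.decode("utf-8")
            out.write(line)
            work.task_done()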
def apply_mt(self, xs, parallelism, **kwargs):
    """Run the UDF multi-threaded using python multiprocessing"""
    if not _meta.postgres:
        raise ValueError(
            "Fonduer must use PostgreSQL as a database backend.")

    # Fill a JoinableQueue with input objects
    in_queue = JoinableQueue()
    for x in xs:
        in_queue.put(x)

    # Use an output queue to track multiprocess progress
    out_queue = JoinableQueue()

    # Track progress counts
    total_count = in_queue.qsize()
    count = 0

    # Start UDF Processes
    for i in range(parallelism):
        udf = self.udf_class(in_queue=in_queue,
                             out_queue=out_queue,
                             worker_id=i,
                             **self.udf_init_kwargs)
        udf.apply_kwargs = kwargs
        self.udfs.append(udf)

    # Start the UDF processes, and then join on their completion
    for udf in self.udfs:
        udf.start()

    while any([udf.is_alive() for udf in self.udfs]) and count < total_count:
        y = out_queue.get()

        # Update progress bar whenever an item is processed
        if y == UDF.TASK_DONE:
            count += 1
            if self.pb is not None:
                self.pb.update(1)
        else:
            raise ValueError("Got non-sentinel output.")

    for udf in self.udfs:
        udf.join()

    # Terminate and flush the processes
    for udf in self.udfs:
        udf.terminate()
    self.udfs = []
def test_basic():
    in_queue = JoinableQueue()
    mysql_reader = Mysqlio('localhost', '3600', 'test', 'root', '')
    mysql_reader.scan_and_queue(in_queue, "SELECT * FROM swallow")
    assert in_queue.qsize() == 3

    res = []
    while not in_queue.empty():
        res.append(in_queue.get())

    expected_res = [{'id': 1, 'libelle': 'test'},
                    {'id': 2, 'libelle': 'john'},
                    {'id': 3, 'libelle': 'woo'}]
    assert res == expected_res
class TaskControl:
    def __init__(self, cls_worker, count, *args, **kwargs):
        self.queue = JoinableQueue()
        self.stopped = Event()
        self.count_processed = Value('i', 0)
        self.processes = [cls_worker(self, *args) for _ in range(count)]
        map(Process.start, self.processes)

    def is_active(self):
        return not self.stopped.is_set()

    def is_alive(self):
        alive = filter(bool, map(Process.is_alive, self.processes))
        print '---- %d child processes are still alive' % len(alive)
        return alive

    def stop(self):
        self.stopped.set()
        self.queue.close()
        print '-- waiting for processes to finish'
        map(Process.join, self.processes)
        self.queue.cancel_join_thread()

    def send_chunk(self, items):
        map(self.queue.put, items)
        print '--- waiting for queue to complete'
        while self.get_stats()[1] and self.is_alive():
            time.sleep(1)

    def get(self):
        while self.is_active():
            try:
                yield self.queue.get(timeout=1)
            except Queue.Empty:
                pass

    def tick(self):
        self.queue.task_done()
        self.count_processed.value += 1
        if not self.count_processed.value % 20:
            print '%d items processed' % self.count_processed.value
            time.sleep(0.5)

    def get_stats(self):
        stats = self.count_processed.value, self.queue.qsize()
        print '--- %d items processed, %d queued' % stats
        return stats
def pdf_workers(workers_count=2, debug=True, files=0):
    if db.is_closed():
        db.connect()

    additional = Job.select().where(Job.status == 1).count()
    Job.update(status=0).where(Job.status == 1).execute()
    progress_bar = ProcessBar(additional + files, debug=files != 0)

    queue = JoinableQueue()
    db_queue = Queue()
    workers = Workers(workers_count, queue, db_queue, files == 0)

    while True:
        if files != 0 and Job.select().where(Job.status != 2).count() == 0:
            break
        progress_bar.update((workers.cnt.value - progress_bar.value))
        if not db_queue.empty():
            jobs = []
            while not db_queue.empty():
                jobs.append(db_queue.get())
            with db.atomic():
                Job.bulk_update(jobs, fields=["status"], batch_size=100)
        elif workers_count - queue.qsize() > 0:
            additional_jobs = min(
                Job.select().where(Job.status == 0).count(),
                len(workers) - workers.pending.value,
            )
            if additional_jobs == 0:
                continue
            jobs = []
            for job in (
                Job.select()
                .where(Job.status == 0)
                .paginate(1, additional_jobs)
            ):
                job.status = 1
                jobs.append(job)
            with db.atomic():
                Job.bulk_update(jobs, fields=["status"], batch_size=100)
            for job in jobs:
                queue.put(job)

    queue.join()
    db.close()
def corpus_analyser(corpus_path, process_count):
    print('Scanning corpora...')
    file_queue = JoinableQueue()
    hearst_dict = Manager().dict()
    word_count = Value('i', 0)

    for root, subFolders, files in os.walk(corpus_path):
        for current_file in files:
            if current_file.endswith(".txt"):
                file_queue.put(os.path.join(root, current_file))

    file_count = file_queue.qsize()
    print "{0} files found.\n".format(file_count)
    sys.stdout.write("\r0.00%\tWord count: 0")

    def worker(process_id):
        while not file_queue.empty():
            current_path = file_queue.get()
            with open(current_path, 'r') as current_file:
                data = ' '.join(current_file.read().replace('\n', ' ').split())
                data_tokenized = word_tokenize(data)
                word_count.value += len(data_tokenized)
                data_tagged = pos_tag(data_tokenized)
                hearst_patterns(data_tagged, hearst_dict)
            percentage = 100.0 - ((float(file_queue.qsize()) / float(file_count)) * 100.0)
            sys.stdout.write("\r{0:.2f}%\tWord count: {1}".format(percentage, word_count.value))
            sys.stdout.flush()
            file_queue.task_done()

    for pid in range(process_count):
        process = Process(target=worker, args=(pid,))
        process.daemon = True
        process.start()

    file_queue.join()
    print "\n"
    return hearst_dict
def run(this):
    Factory.pg("configuration test...")
    this.test()  # check configuration
    source = this.source
    fout_name = this.fout_name
    func = this.func
    fparam = this.fparam
    num_workers = this.num_workers
    worker = this.worker
    progressor = this.progressor

    # queue settings
    Factory.pg("arranging source elements...")
    from multiprocessing import JoinableQueue, Process
    in_queue = JoinableQueue()
    for item in source:
        in_queue.put(item)

    # worker progressing
    progressor = Process(target=progressor, args=(in_queue, in_queue.qsize()))
    import time
    start_time = time.time()
    progressor.start()

    # worker settings
    fouts, workers = [], []
    for w_id in xrange(num_workers):
        fouts.append(open("%s_part%d" % (fout_name, w_id), "w"))
        workers.append(Process(target=worker, args=(w_id, in_queue, func, fparam, fouts[w_id])))
        workers[w_id].start()

    # post processing
    in_queue.join()
    for w_id in xrange(num_workers):
        workers[w_id].terminate()
    progressor.terminate()
    end_time = time.time()
    Factory.pg("working done (%.1fs lapsed)" % (end_time - start_time), br=True)

    import os
    os.system("cat %s_part* > %s" % (fout_name, fout_name))
    os.system("rm -f %s_part*" % (fout_name))
def worker(queue: JoinableQueue, securitiesDictionary: SecuritiesDict,
           orderStatistics: OrderStatisticsAggregator):
    """[The worker that will run in a thread]

    Args:
        queue (JoinableQueue): [The queue from which items will be taken]
        securitiesDictionary (SecuritiesDict): [Contains securities data keyed by securityId]
        orderStatistics (OrderStatisticsAggregator): [Contains aggregated order data]
    """
    counter: int = 0
    notEmpty: bool = True
    while notEmpty and queue.qsize():
        try:
            jsonStr = queue.get(block=True, timeout=5.0)
            jsonStr = jsonStr.decode('ASCII')
            innerProcessor(jsonStr, securitiesDictionary, orderStatistics)
            queue.task_done()
            counter += 1
        except Empty:
            notEmpty = False
class ProcessBlock(Process, ABC): """ The abstract class for a block/process in an execution pipeline """ # Arbitrary timeout for blocking queue operations _poll_interval = 1 def __init__(self, *args, parent=None, queue_size=0, **kwargs): super().__init__(*args, **kwargs) # Events (in the order they should be checked) self.events = OrderedDict([ ("cancel", Event()), ("requeue", Event()), ("stop", Event()), ]) # Corresponding event handlers self.event_handlers = { "cancel": self._cancel_handler, "requeue": self._requeue_handler, "stop": self._stop_handler, } # Master event, to be set after any other event self.event = Event() # The family of the processblock siblings = copy(parent.family.children) if parent is not None else [] self.family = BlockFamily(parent, siblings, []) # Link family with self self.family.link(self) # The object queue self.objs = JoinableQueue(queue_size) # List of objects that were canceled and need re-processing self._canceled_objs = deque() # Logging facility self.logger = getLogger(self.name) # Object currently processed self._obj = None def start(self): super().__init__(name=self.name) super().start() @abstractmethod def process_obj(self, obj): """ The actual work a block wants to perform on a object """ raise NotImplementedError() def _stop_handler(self): """ Send the "end object" (None) to every child """ self.logger.debug("sending the 'end object' to child processes...") for _ in self.family.alive_children(): self.objs.put(None) def cancel(self): """ Set the cancel event and the master event """ self.events["cancel"].set() self.event.set() def _cancel_handler(self): """ Cancel children's objects and re-queue them in self._canceled_objs """ self.logger.debug("ask children to requeue their objects") for child in self.family.alive_children(): child.events["requeue"].set() child.event.set() self.logger.debug("fetching canceled objects...") while (self.objs.qsize() != 0 or any(child.events["requeue"].is_set() for child in self.family.alive_children())): try: obj = self.objs.get_nowait() self.objs.task_done() except Empty: continue if obj is not None: self._canceled_objs.append(obj) # To be able to stop without the parent block sending an 'end object' if self.events["stop"].is_set(): self._canceled_objs.append(None) self.events["stop"].clear() # Clear the event self.events["cancel"].clear() def _requeue_handler(self): """ Requeue every object managed by the block or one of its children """ for child in self.family.alive_children(): child.events["requeue"].set() child.event.set() self.logger.debug("requeueing objects...") if self._obj is not None: self.family.parent.objs.put(self._obj) self._obj = None while (self.objs.qsize() != 0 or any(child.events["requeue"].is_set() for child in self.family.alive_children())): try: obj = self.objs.get_nowait() self.objs.task_done() except Empty: # Do not waste that time if self._canceled_objs: obj = self._canceled_objs.popleft() else: continue if obj is not None: self.family.parent.objs.put(obj) for obj in filter(lambda x: x is not None, self._canceled_objs): self.family.parent.objs.put(obj) self.logger.debug("wait for parent to fetch all the objects...") self.family.parent.objs.join() # Processblock was potentially stopped self.events["stop"].clear() # Clear the event self.events["requeue"].clear() def _process_events(self, ignore=()): """ Process events The order in which events are processed is important Returns: True --- if an Event was processed False --- otherwise """ self.logger.debug("process events...") if not 
self.event.is_set(): return False self.event.clear() event_processed = False for event_name in self.events: if event_name in ignore: continue if self.events[event_name].is_set(): self.logger.debug("processing '%s' event", event_name) self.event_handlers[event_name]() event_processed = True return event_processed def get_obj(self, timeout=None): """ Get an object from the parent block """ self.logger.debug("get an object to process...") try: return self._canceled_objs.popleft() except IndexError: obj = self.family.parent.objs.get(timeout=timeout) self.family.parent.objs.task_done() return obj def try_publish_obj(self, obj, poll_interval=None): """ Publish `obj` to child blocks (unless `obj` is None) Returns: True if `obj` was published False if an event occured before `obj` was published """ if obj is None: return True if not self.family.children: self.logger.debug("no one to pass '%s' onto", obj) return True self.logger.debug("publish '%s'", obj) while not self.event.is_set(): try: self.objs.put(obj, timeout=poll_interval) except Full: continue return True # An event occured self.logger.debug("publication was interrupted by an event") return False def _cleanup(self): """ Tell parent and siblings we stop and exit cleanly """ if self.family.parent is not None: self.family.parent.event.set() for sibling in self.family.siblings: sibling.event.set() self.logger.debug("waiting for child processes...") for child in self.family.children: child.join() def run(self): """ Launch child blocks and process objects """ # Launch child blocks # Children are started here in order to build a gracefull process tree self.logger.debug("start %d child(ren)", len(self.family.children)) for child in self.family.children: child.start() while not self.events["stop"].is_set(): # Processing loop while not self.events["stop"].is_set(): # Process exterior events if self._process_events(): continue # Find an object to process if self._obj is None: try: self._obj = self.get_obj(timeout=self._poll_interval) except Empty: continue if self._obj is None: self.logger.debug("received the 'end object'") self.events["stop"].set() self.event.set() continue obj = self._obj # Process the object self.logger.debug("process '%s'", obj) try: obj = self.process_obj(obj) except ProcessingError as exc: self.logger.warning(exc) continue except EventInterrupt: # An event ocrrured, process it continue # Publish the processed object, check for events periodically if self.try_publish_obj(obj, poll_interval=self._poll_interval): # Object was published, or did not need to be self._obj = None # Process the stop event (which is ignored in the loop underneath) self._process_events() # Wait for the entire family to stop, unless `stop` gets cleared while (self.events["stop"].is_set() and not self.family.is_stopped()): self.event.wait() self._process_events(ignore=("stop",)) # Process is exiting, there is no turning back # Every sibling/child process will shortly do so too (or already have) self._cleanup() self.logger.debug("terminating")
class MPController(object): """ Main MP object which maintains various queue and task objects and info. Launches generic client task processors as independant processes, populates the input queue and manages the writer process. """ def __init__(self, heartbeat=None, numProc=None, chunkSize=1): self.heartbeat = heartbeat # Heartbeat output manager self.tasks = JoinableQueue() # Tasks for processing to be added here self.results = Queue() # Processed results accumulate here self.writerConn = Pipe() # Direct pipe to writer manager self.writer = self.writerConn[0] self.resultDict = {} # Info and orderedStream objects if numProc is None: numProc = max(cpu_count() - 2, 1) self.nproc = numProc self.chunkSize = chunkSize if self.heartbeat: #heartbeat.Lock = Lock() heartbeat.message("Launching %d sub-processes" % self.nproc, True) heartbeat.message( "Tasks will process pairs in chunks of %d" % self.chunkSize, True) else: print "CONTROLLER WILL LAUNCH %d sub-processes" % self.nproc print "Tasks will process pairs in chunks of %d" % self.chunkSize sys.stdout.flush() #self.configureQueueLimits() self.tasksRunning = 0 self.Counter = Value('L', 0) # Shared value: counts processed pairs self.recordsProcessed = 0 tLock = Lock() tValue = Value('L', 0) self.workers = [ TaskProcessor(self.tasks, self.results, tValue, tLock) for i in xrange(self.nproc) ] def configureQueueLimits(self): availGB = AvailableRAM() self.HIGH_WATERMARK = int(WATERMARK_BASE * availGB) self.LOW_WATERMARK = int(round(self.HIGH_WATERMARK * 0.50)) def add(self, task, qhigh, qlow): """Block if Qsize above a certain high watermark in item size, and don't release until it has fallen below the low watermark""" try: qlen = self.tasks.qsize() if qlen > qhigh: print "Throttling input, reached HWM:", qhigh while qlen > qlow: delay = random.randint(1, 10) time.sleep(delay) qlen = self.tasks.qsize() print "Throttling released, down to LWM:", qlow except NotImplementedError: # Skip on Mac OS X (WARNING - use on OS X in testing only, queue # size will max out at a paltry 32768 items) pass try: self.tasks.put(task) self.recordsProcessed += task.datalen except qFull: # While testing: we shouldn't hopefully end up here... print "ERR: queue full" sys.exit(-1) def finishQueue(self): for i in xrange(self.nproc): self.tasks.put(None) if self.heartbeat is not None: self.heartbeat.total = self.recordsProcessed newCount = self.recordsProcessed self.heartbeat.message("Definitive pair count: %d" % newCount, True) def start(self): i = 0 for worker in self.workers: worker.start() i += 1 self.tasksRunning = self.nproc def OneTaskDone(self): self.tasksRunning -= 1 def wait(self): self.tasks.join() if self.heartbeat: self.heartbeat.count = self.recordsProcessed def finishProcesses(self): self.writer.close() def getpids(self): pids = [] for w in self.workers: pids.append(w.pid) return pids def Send(self, obj): self.writer.send(obj) def Recv(self): return self.writer.recv()
class WebDav: NUM_WORKING_PROCESSES = 5 def __init__(self, host, user, passwd, timeout=-999, logger=None): self.fp = dict() webdav_host = host self.webdav_host = webdav_host self.host = host self.user = user self.passwd = passwd self.processes = [] self.file_queue = JoinableQueue(maxsize=0) self.result_queue = Queue(maxsize=0) self.is_alive = { "status": True } options = { 'webdav_hostname': self.webdav_host, 'webdav_login': self.user, 'webdav_password': self.passwd } self.webdavClient = wc.Client(options) self.logger = logger self._tzinfo = TimeZoneMSK() def parent(self, path): return urn.Urn(path).parent() def path(self, path): return urn.Urn(path).path() def generate_file_info(self, file_path): info = self.webdavClient.info(file_path) is_dir = False is_link = False if self.webdavClient.is_dir(file_path): is_dir = True else: pass file_name = urn.Urn(file_path).filename().replace("/", "") file_dir = urn.Urn(file_path).parent() ext = '' divide = file_name.split('.') if len(divide) > 1: ext = file_name.split('.')[-1].lower() mtime = info['modified'] file_info = { "is_dir": is_dir, "is_link": is_link, "name": file_name, "ext": ext, "path": file_dir, "owner": self.user, "mode": "600", "size": info['size'] if not is_dir else 0, "mtime": mtime, 'mtime_str': str(mtime), } return file_info def _make_file_info(self, file_queue, result_queue, logger, timeout): while int(time.time()) < timeout: if file_queue.empty() is not True: file_path = file_queue.get() try: file_info = self.generate_file_info(file_path) result_queue.put(file_info) except UnicodeDecodeError as unicode_e: logger.error( "UnicodeDecodeError %s, %s" % (str(unicode_e), traceback.format_exc())) except IOError as io_e: logger.error("IOError %s, %s" % (str(io_e), traceback.format_exc())) except Exception as other_e: logger.error("Exception %s, %s" % (str(other_e), traceback.format_exc())) finally: file_queue.task_done() else: time.sleep(REQUEST_DELAY) @staticmethod def to_byte(value): if isinstance(value, str): try: value = value.encode("utf-8") except UnicodeDecodeError: value = value.encode("ISO-8859-1") return value def size(self, path): try: return self.webdavClient.info(path)['size'] except Exception as e: self.logger.error("Error in WebDav size(): %s, traceback = %s" % (str(e), traceback.format_exc())) return 0 def info(self, path): return self.webdavClient.info(self.to_byte(path)) def exists(self, path): return self.webdavClient.check(path) def isdir(self, path): return self.webdavClient.is_dir(path) def isfile(self, path): return not self.webdavClient.is_dir(self.to_byte(path)) def list(self, path): flist = { "path": path, "items": [] } try: self.webdavClient.check('/') except Exception: raise Exception("Error during establishing webdav connection") listdir = self.webdavClient.list(self.to_byte(path)) self.logger.info("listdir=%s", listdir) time_limit = int(time.time()) + TIMEOUT_LIMIT self.file_queue = JoinableQueue(maxsize=0) self.result_queue = Queue(maxsize=0) for i in range(self.NUM_WORKING_PROCESSES): p = Process(target=self._make_file_info, args=(self.file_queue, self.result_queue, self.logger, time_limit)) p.start() proc = psutil.Process(p.pid) proc.ionice(psutil.IOPRIO_CLASS_IDLE) proc.nice(20) self.logger.debug( "ListDir worker #%s, set ionice = idle and nice = 20 for pid %s" % ( str(i), str(p.pid))) self.processes.append(p) for name in listdir: try: item_path = '{0}/{1}'.format(path, name) self.file_queue.put(item_path) except UnicodeDecodeError as e: self.logger.error( "UnicodeDecodeError %s, %s" % (str(e), 
traceback.format_exc())) except IOError as e: self.logger.error("IOError %s, %s" % (str(e), traceback.format_exc())) except Exception as e: self.logger.error( "Exception %s, %s" % (str(e), traceback.format_exc())) while not self.file_queue.empty(): self.logger.debug("file_queue size = %s , empty = %s (timeout: %s/%s)" % ( self.file_queue.qsize(), self.file_queue.empty(), str(int(time.time())), time_limit)) time.sleep(REQUEST_DELAY) if self.file_queue.empty(): self.logger.debug("join() file_queue until workers done jobs") self.file_queue.join() for p in self.processes: try: self.logger.debug("WebDav ListDir terminate worker process, pid = %s" % p.pid) kill(p.pid, signal.SIGKILL, self.logger) except OSError: self.logger.error( "ListDir unable to terminate worker process, pid = %s" % p.pid) if self.is_alive['status'] is True: while not self.result_queue.empty(): file_info = self.result_queue.get() flist["items"].append(file_info) return flist def listdir(self, path): listdir = self.webdavClient.list(path) listing = [] for name in listdir: item_path = '{0}/{1}'.format(path, name) listing.append(item_path) return listing def remove(self, target): try: self.logger.debug("Removing target=%s" % target) if self.isdir(target): target += '/' self.webdavClient.unpublish(target) self.webdavClient.clean(target) except Exception as e: self.logger.error("Error in WebDav dir remove(): %s, traceback = %s" % (str(e), traceback.format_exc())) raise Exception def mkdir(self, path): self.logger.debug("Creating directory=%s" % path) return self.webdavClient.mkdir(path) def upload(self, source, target, overwrite=False, rename=None, operation_progress=None): result = {} file_list = {} succeed = [] failed = [] try: if rename is not None: target_path = os.path.join(target, rename) else: target_path = os.path.join(target, source) if not overwrite and self.exists(target_path): failed.append(source) raise Exception("File '%s' already exists and overwrite not permitted" % target_path) try: self.logger.debug("Uploading target_path=%s, source=%s" % (target_path, source)) self.webdavClient.upload(target_path, source, operation_progress) except Exception as e: failed.append(source) self.logger.error("Error in WebDav upload(): %s, traceback = %s" % (str(e), traceback.format_exc())) raise Exception("Error during file uploading %s" % traceback.format_exc()) succeed.append(source) file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = True result['error'] = None result['file_list'] = file_list return result except Exception as e: self.logger.error("Error in WebDav upload(): %s, traceback = %s" % (str(e), traceback.format_exc())) file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = False result['error'] = e result['file_list'] = file_list return result def download(self, source, target, operation_progress=None): result = {} file_list = {} succeed = [] failed = [] try: target_path = os.path.join(target, os.path.basename(source)) try: self.logger.debug("Downloading source=%s, target_path=%s" % (source, target_path)) self.webdavClient.download(source, target_path, operation_progress) except Exception as e: failed.append(source) self.logger.error("Error in WebDav download(): %s, traceback = %s" % (str(e), traceback.format_exc())) raise Exception("Error during file download") succeed.append(source) file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = True result['error'] = None result['file_list'] = file_list return result except Exception as e: 
self.logger.error("Error in WebDav download(): %s, traceback = %s" % (str(e), traceback.format_exc())) file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = False result['error'] = e result['file_list'] = file_list return result def copy_file(self, source, target, overwrite=False): result = {} file_list = {} succeed = [] failed = [] try: if not overwrite and self.exists(target): failed.append(source) raise Exception('file exist and cannot be overwritten') try: self.logger.debug("Copying file source=%s, target=%s" % (source, target)) self.webdavClient.copy(source, target) except Exception as e: failed.append(source) raise Exception('Cannot copy file %s' % (e,)) succeed.append(source) file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = True result['error'] = None result['file_list'] = file_list return result except Exception as e: file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = False result['error'] = e result['file_list'] = file_list return result def move_file(self, source, target, overwrite=False): result = {} file_list = {} succeed = [] failed = [] try: if not overwrite and self.exists(target): failed.append(source) raise Exception('file exist and cannot be overwritten') try: self.logger.debug("Moving file source=%s, target=%s" % (source, target)) self.webdavClient.move(source, target) except Exception as e: failed.append(source) raise Exception('Cannot move file %s' % (e,)) succeed.append(source) file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = True result['error'] = None result['file_list'] = file_list return result except Exception as e: file_list['succeed'] = succeed file_list['failed'] = failed result['success'] = False result['error'] = e result['file_list'] = file_list return result def make_destination_dir(self, destination, overwrite): self.logger.info("making destination %s" % destination) if not self.exists(destination): self.mkdir(destination) elif overwrite and self.exists(destination) and not self.isdir(destination): self.remove(destination) self.mkdir(destination) elif not overwrite and self.exists(destination) and not self.isdir(destination): raise Exception("destination is not a dir") else: pass
db = psycopg2.connect("dbname='blockchain_ethereum4' user='******' host='localhost' password=''",
                      cursor_factory=psycopg2.extras.DictCursor)
cursor = db.cursor()
cursor.execute("SELECT DISTINCT(address) FROM ethereum.Addresses")

work_queue = JoinableQueue()
addresses_count = 0

for i in range(NUM_WORKERS):
    CrawlerWorkerProcess(work_queue).start()

for row in cursor.fetchall():
    work_queue.put(row.get("address"))
    addresses_count += 1

print("Looking up %d addresses on Etherscan.io using %d workers." % (addresses_count, NUM_WORKERS))

last_time = time.time()
previous_addresses_left = work_queue.qsize()

while work_queue.qsize() > 0:
    time_now = time.time()
    time_since = (time_now - last_time)
    addresses_left = work_queue.qsize()
    addresses_done_since_last = previous_addresses_left - addresses_left
    try:
        time_left = int(addresses_left * (time_since / addresses_done_since_last))
    except ZeroDivisionError:
        time_left = 9999999999999
    m, s = divmod(time_left, 60)
    h, m = divmod(m, 60)
    time_remaining_string = "%dh:%02dm:%02ds" % (h, m, s)
    print("#" * 50)
    print("###### Last %d addresses took: % 4.2fs - %d addresses left (%s) ######"
          % (addresses_done_since_last, time_since, addresses_left, time_remaining_string))
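# CrawlerWorkerProcess is referenced but not defined in this excerpt; a minimal hypothetical
# stand-in is sketched below purely to show the shape the script expects: a Process whose run()
# keeps pulling addresses off the shared JoinableQueue and calls task_done() for each one.
# The lookup itself (the Etherscan.io API call) is elided.
from multiprocessing import JoinableQueue, Process
from queue import Empty


class CrawlerWorkerProcess(Process):
    def __init__(self, work_queue: JoinableQueue):
        super().__init__(daemon=True)
        self.work_queue = work_queue

    def run(self):
        while True:
            try:
                address = self.work_queue.get(timeout=5)
            except Empty:
                break
            # ... look the address up and store the result (omitted) ...
            self.work_queue.task_done()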
def main(factor=2):
    # e.g. if the total number of cores is 2, the number of processes to be spawned is 2 * factor
    files_to_download = JoinableQueue()
    result_queue = JoinableQueue()
    time_taken = JoinableQueue()
    time_taken_to_read_from_queue = JoinableQueue()

    with open('downloads.txt', 'r') as f:
        for to_download in f:
            files_to_download.put_nowait(to_download.split('\n')[0])
    files_to_download_size = files_to_download.qsize()

    cores = cpu_count()
    no_of_processes = cores * factor
    for i in xrange(no_of_processes):
        files_to_download.put_nowait(None)

    jobs = []
    start = datetime.datetime.now()
    for name in xrange(no_of_processes):
        p = Process(target=download, args=(files_to_download, result_queue,
                                           time_taken, time_taken_to_read_from_queue, name))
        p.start()
        jobs.append(p)
    for job in jobs:
        job.join()

    print result_queue.qsize()
    total_downloaded_urls = 0
    try:
        while 1:
            r = result_queue.get_nowait()
            total_downloaded_urls += r
    except Empty:
        pass

    try:
        while 1:
            """
            locals() keeps track of all variables, functions, classes etc.
            A datetime object is different from an int: one cannot perform 0 + datetime.datetime.now(),
            so the first time we read from the queue of time objects, total_time is set to that value.
            """
            if 'total_time' in locals():
                total_time += time_taken.get_nowait()
            else:
                total_time = time_taken.get_nowait()
    except Empty:
        print("{0} processes on {1} core machine took {2} time to download {3} urls".format(
            no_of_processes, cores, total_time, total_downloaded_urls))

    try:
        while 1:
            if 'queue_reading_time' in locals():
                queue_reading_time += time_taken_to_read_from_queue.get_nowait()
            else:
                queue_reading_time = time_taken_to_read_from_queue.get_nowait()
    except Empty:
        print("{0} processes on {1} core machine took {2} time to read {3} urls from queue".format(
            no_of_processes, cores, queue_reading_time, files_to_download_size))
class TagCounter:
    def __init__(self, domain, input_col):
        self.domain = domain
        self.input_col = input_col
        self.htmls_queue = JoinableQueue()
        self.tags_queue = Queue()
        self.tag_count = {}

    def count_tags(self):
        tag_count_start_time = datetime.now()
        self.fill_htmls_queue()
        print("Input htmls_queue size: ", self.htmls_queue.qsize())

        pool = Pool(settings.NUM_PROCESSORS, self.processor)
        self.htmls_queue.join()
        print("Output tags_queue size: ", self.tags_queue.qsize())

        tags_merge_start_time = datetime.now()
        while True:  # TODO: move tags merging to a separate process (>20% processing time)
            try:
                tags_batch = self.tags_queue.get(True, 2)
            except Empty:
                print("No batches in tags_queue. tags_queue size: %d htmls_queue size: %d"
                      % (self.tags_queue.qsize(), self.htmls_queue.qsize()))
                break
            for tag, count in tags_batch.items():
                if self.tag_count.get(tag):
                    self.tag_count[tag] += count
                else:
                    self.tag_count[tag] = count

        finish_time = datetime.now()
        print("Tags merging time: %s" % (finish_time - tags_merge_start_time))
        print("Tags counting time: %s" % (finish_time - tag_count_start_time))
        pool.close()  # TODO: why does tags_queue.get get stuck if we close the pool before tags are merged?

    # def persist_tags(): saves to pickle file or Mongo

    def fill_htmls_queue(self):
        domain_htmls = list(self.input_col.find({'domain': self.domain}))  # TODO: get slices from mongo
        # TODO: check amount of tags in slowly processed HTMLs
        batch_size = settings.BATCH_SIZE
        html_batch_start_time = datetime.now()
        while domain_htmls:
            batch = domain_htmls[:batch_size]
            self.htmls_queue.put(batch)  # TODO: move html batching to a separate process
            domain_htmls = domain_htmls[batch_size:]
        print("HTMLs batching time: %s" % (datetime.now() - html_batch_start_time))

    def processor(self):
        pages_processed = 0
        pid = os.getpid()
        while True:
            tag_count = {}
            try:
                htmls_batch = self.htmls_queue.get(True, 2)  # TODO: test changing timeout
            except Empty:
                print("No batches in htmls_queue, pid: %d tags_queue size: %d htmls_queue size: %d"
                      % (pid, self.tags_queue.qsize(), self.htmls_queue.qsize()))
                # TODO: check if htmls_queue still exists at that moment
                break
            print("Process %d created, batch len: %d" % (pid, len(htmls_batch)))
            for page in htmls_batch:
                if page.get('no_repeat_html'):
                    soup = BeautifulSoup(page['no_repeat_html'], 'html.parser')
                else:
                    soup = BeautifulSoup(page['full_page_html'], 'html.parser')
                for tag in soup.find_all():
                    # TODO: add tag processing here as a separate function
                    stag = str(tag).strip().replace(" ", "").lower()
                    if stag in tag_count:
                        tag_count[stag] += 1
                    else:
                        tag_count[stag] = 1
                pages_processed += 1
                if pages_processed % 100 == 0:
                    print('%s, %s, pages_processed = %d, len(tag_count) = %d'
                          % (pid, str(datetime.now()), pages_processed, len(tag_count)))
            self.tags_queue.put(tag_count)
            self.htmls_queue.task_done()

    def sort_tags(self):
        self.tag_count = sorted(self.tag_count.items(), key=operator.itemgetter(1), reverse=True)
class Engine(Process): """ Based off `threading.Thread` Instantiated in server Contains buffer, queue for processors, processors, ConsumerThread. """ def __init__(self, engine_conn, processors=4, buffer_roll=0, buffer_max_batch=50, buffer_max_seconds=1, test_mode=False, test_outfile='engine_test_output/engine_test_output'): """ Initializes with empty buffer & queue, set # of processors... :param processors: number of processors to start :type processors: int """ logger.info("Initializing EngineThread") super().__init__() self.test_run = test_mode self.test_outfile = test_outfile self.test_batches = {} self.pipe_conn = engine_conn self.buffers_out_q = JoinableQueue() self.number_of_processors = processors self.processors = [] self.run_engine = False self.buffer_record_limit = int(buffer_max_batch) self.buffer_time_limit_s = float(buffer_max_seconds) self.buffers = {} self.buffer_in_qs = {} self.buffer_workers = {} self.data_pullers = {} self.buffer_roll = -buffer_roll if buffer_roll > 0: self.buffer_roll_index = -buffer_roll else: self.buffer_roll_index = None def _init_processors(self): """Initializes + starts set number of processors""" for n in range(self.number_of_processors): processor = Processor(self.buffers_out_q, self.test_run) processor.start() self.processors.append(processor) def _new_buffer(self, partition_key): if partition_key not in self.buffers: self.buffers[partition_key] = np.array([{ 0: 0 }] * (self.buffer_record_limit * self.number_of_processors)).reshape( self.number_of_processors, self.buffer_record_limit) if self.test_run: self.test_batches[partition_key] = 1 if partition_key not in self.buffer_in_qs: self.buffer_in_qs[partition_key] = Queue() else: logger.warn( f"New buffer, existing buffer_in_q for stream {partition_key}" ) if partition_key not in self.buffer_workers: self.buffer_workers[partition_key] = Thread( target=self.run_buffer, args=[partition_key]) self.buffer_workers[partition_key].start() else: logger.warn( f"New buffer, existing buffer_worker for stream {partition_key}" ) return True else: return False def _new_data_puller(self, partition_key, template): if partition_key not in self.data_pullers: if partition_key in self.buffer_in_qs: self.data_pullers[partition_key] = DataPuller( template, self.buffer_in_qs[partition_key]) self.data_pullers[partition_key].start() else: raise ValueError( f"Attempting to init data_puller for stream {partition_key}, buffer input queue for stream does not exist" ) return True else: return False def run(self): """ Sets up numpy array buffer and puts stuff in and gets stuff out """ self._init_processors() self.run_engine = True while self.run_engine: if self.pipe_conn.poll(): item = self.pipe_conn.recv() # branch 2 - stop engine if item == "stop_poison_pill": for q in self.buffer_in_qs.keys(): self.buffer_in_qs[q].put("stop_buffer_worker") self.run_engine = False break # branch 1 - engine running, good data elif type(item) is tuple: partition_key = item[0]['stream_token'] new_buffer = self._new_buffer(partition_key) if new_buffer: logger.info( f"Initialized buffer for stream {partition_key}") if item[1] == "new": if item[0]["data_rules"]["pull"] is True: new_puller = self._new_data_puller( partition_key, item[0]) if new_puller: print( f"Initialized data puller for stream {partition_key}" ) else: logger.warn( f"Attempting to initialize data puller for stream {partition_key} - puller already exists" ) elif item[1] == "load": self.buffer_in_qs[partition_key].put(item[0]) else: raise TypeError( "Invalid tuple in pipe - index 1 
must be str 'load' or str 'new'" ) else: raise TypeError("Invalid item in pipe") logger.info("Terminating Engine Thread") self.stop_engine() def run_buffer(self, partition_key): last_col = self.buffer_record_limit - 1 last_row = self.number_of_processors - 1 cur_row = 0 cur_col = 0 batch_tracker = {'start_time': time(), 'leftos_collected': False} while self.run_engine: try: item = self.buffer_in_qs[partition_key].get( timeout=self.buffer_time_limit_s) # branch 2 - stop engine if item == "stop_buffer_worker": break # branch 1 - engine running, good data elif isinstance( item, DStream) or (type(item) is dict and "stream_token" in item.keys()): if "data_rules" in item.keys( ): # some unit test data doesnt have this field if "date_format" in item["data_rules"].keys(): if item["data_rules"]["date_format"] is not None: item["timestamp"] = datetime.strptime( item["timestamp"], item["data_rules"] ["date_format"]).timestamp() # branch 1.1 - not last row if cur_row < last_row: # branch 1.1a - not last column, continue row if cur_col < last_col: logger.info("Buffering- row {}".format(cur_row)) self.buffers[partition_key][cur_row, cur_col] = item cur_col += 1 # branch 1.1b - last column, start new row else: self.buffers[partition_key][cur_row, cur_col] = item if self.test_run: self.buffers_out_q.put(( self.buffers[partition_key] [cur_row].copy(), f"{self.test_outfile}_{partition_key}_{self.test_batches[partition_key]}.txt" )) self.test_batches[partition_key] += 1 else: self.buffers_out_q.put( self.buffers[partition_key] [cur_row].copy()) logger.info("New batch queued") roll_window = self.buffers[partition_key][ cur_row, self.buffer_roll_index:] cur_row += 1 for n in roll_window: for i in range(abs(self.buffer_roll)): self.buffers[partition_key][cur_row, i] = n cur_col -= cur_col + self.buffer_roll # REMOVE batch_tracker['start_time'] = time() # branch 1.2 - last row else: # branch 1.2a - not last column, continue row if cur_col < last_col: self.buffers[partition_key][cur_row, cur_col] = item cur_col += 1 # branch 1.2b - last column, start return to first row in new cycle else: self.buffers[partition_key][cur_row, cur_col] = item if self.test_run: self.buffers_out_q.put(( self.buffers[partition_key] [cur_row].copy(), f"{self.test_outfile}_{partition_key}_{self.test_batches[partition_key]}.txt" )) self.test_batches[partition_key] += 1 else: self.buffers_out_q.put( self.buffers[partition_key] [cur_row].copy()) roll_window = self.buffers[partition_key][ cur_row, self.buffer_roll_index:] cur_row -= cur_row for n in roll_window: for i in range(abs(self.buffer_roll)): self.buffers[partition_key][cur_row, i] = n cur_col -= cur_col + self.buffer_roll batch_tracker['start_time'] = time() batch_tracker['leftos_collected'] = False # branch 3 bad data else: raise TypeError("Queued item is not valid dictionary.") except: # buffer time max reached, engine still running logger.info("Buffer batch timeout exceeded") if self.run_engine is True: # engine running, batch timeout with new buffer data (partial row) if cur_col > abs( self.buffer_roll ) and batch_tracker['leftos_collected'] is False: logger.info( "Collecting leftovers- pushing partial batch to queue after batch timeout" ) if self.test_run: self.buffers_out_q.put(( self.buffers[partition_key][ cur_row, :cur_col].copy(), f"{self.test_outfile}_{partition_key}_{self.test_batches[partition_key]}.txt" )) self.test_batches[partition_key] += 1 else: self.buffers_out_q.put(self.buffers[partition_key][ cur_row, :cur_col].copy()) if cur_row < last_row: cur_row += 1 else: 
cur_row -= cur_row cur_col -= cur_col batch_tracker['start_time'] = time() batch_tracker['leftos_collected'] = True # leftovers already collected else: logger.info("No new data- resetting batch timer") batch_tracker['start_time'] = time() def stop_engine(self): self.pipe_conn.close() if self.run_engine is True: self.run_engine = False for p in self.data_pullers.keys(): self.data_pullers[p].pulling = False logger.info(self.buffers_out_q.qsize()) self.buffers_out_q.join() logger.info("Queue joined") for p in self.processors: logger.info("Putting poison pills in Q") self.buffers_out_q.put("666_kIlL_thE_pROCess_666") logger.info("Poison pills done") for p in self.processors: p.join() logger.info("Engine shutdown- processor joined") print("done")
paths = findFilesInDir('/mnt/ephemeral0/xml/')
unsearched = Queue()
for path in paths:
    unsearched.put(path)
print("Number of files", len(paths))

if DEBUG:
    # Run one process
    print("DEBUG: Running single threaded.")
    parallel_worker()
else:
    print("Starting pool")
    NUM_WORKERS = 7
    pool = Pool(NUM_WORKERS)
    results = [pool.apply_async(parallel_worker) for i in range(NUM_WORKERS)]
    print("Running progress capture.")
    while (True):
        remaining = unsearched.qsize()
        print "Waiting for", remaining, "tasks to complete..."
        time.sleep(0.5)
    # print [result.get() for result in results]

unsearched.join()
print 'Done'
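# parallel_worker and findFilesInDir are not shown in the snippet above. A hypothetical sketch of
# the worker follows, inferred from how it is used: it reads paths from the global `unsearched`
# queue until the queue is drained and calls task_done() for each, so unsearched.join() can
# return. The actual per-file search logic is elided; Python 2 is assumed, matching the snippet's
# print statements.
def parallel_worker():
    from Queue import Empty  # Python 2 module name
    while True:
        try:
            path = unsearched.get_nowait()
        except Empty:
            break
        # ... search the XML file at `path` here (omitted) ...
        unsearched.task_done()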
    logging.debug("I should be exiting.")


if __name__ == "__main__":
    start_time = datetime.now()
    q = JoinableQueue(MAX_QUEUE_SIZE)
    out_queue = JoinableQueue()
    # c = Counter()

    qf = QueueFiller(q, 1, MAX_PRIME_NUMBER)
    qf.start()

    tasks = [Task(q, out_queue) for i in range(NUM_WORKERS)]
    for w in tasks:
        w.start()

    logging.info("Items left in queue: {0}".format(q.qsize()))

    logging.debug("Joining q")
    # q.join()
    # qf.join()

    if False:
        processes_active = True
        while processes_active:
            for w in tasks:
                processes_active = False or w.is_alive()
                logging.debug(w.is_alive())
            sleep(0.2)

    for y in tasks:
        y.join()
class ProcessPool:
    """ Class which enables multiprocess calls to custom functions """

    class Shared:
        """ Object shared between processes. Sync'd by the BaseManager """

        def __init__(self):
            self.clear()

        def get(self):
            return self.data

        def add(self, val):
            self.data.append(val)

        def clear(self):
            self.data = []

    def __init__(self, processes_count, *args, **kwargs):
        self.sleep_length = 2
        self.processes_count = processes_count
        self.queue_jobs = JoinableQueue()
        self.processes = []

        BaseManager.register('Shared', self.Shared)
        self.manager = BaseManager()
        self.manager.start()
        self.shared = self.manager.Shared()

        for i in range(self.processes_count):
            p = Process(target=self.make_pool_call)
            p.id = i
            p.start()
            self.processes.append(p)

    def make_pool_call(self):
        while True:
            item_pickled = self.queue_jobs.get()

            if item_pickled is None:
                self.queue_jobs.task_done()
                break

            item = dill.loads(item_pickled)
            call = item.get('call')
            args = item.get('args')
            kwargs = item.get('kwargs')

            try:
                result = call(*args, **kwargs)
                self.shared.add(result)
            except Exception as e:
                import traceback
                traceback.print_exc()
                os.kill(os.getpid(), signal.SIGUSR1)

            self.queue_jobs.task_done()

    def add_job(self, job):
        """
        :param: job: has to be a dilled dict:
            {
                'call': function_to_be_called_by_process,
                'args': [],
                'kwargs': {},
            }
        """
        self.queue_jobs.put(job)

    def finish_pool_queue(self):
        while self.queue_jobs.qsize() > 0:
            sleep(self.sleep_length)
        for i in range(self.processes_count):
            self.queue_jobs.put(None)
        self.queue_jobs.join()
        self.queue_jobs.close()
        for p in self.processes:
            p.join()
        del self.processes[:]

    def get_pool_results(self):
        return self.shared.get()

    def clear_pool_results(self):
        self.shared.clear()
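# Hypothetical usage sketch of ProcessPool above, based on the add_job docstring: each job is a
# dict serialized with dill containing 'call', 'args' and 'kwargs'. The target function used here
# (operator.mul) is only an example and assumes dill is importable in the calling module.
import dill
import operator

pool = ProcessPool(processes_count=4)
for n in range(10):
    pool.add_job(dill.dumps({'call': operator.mul, 'args': [n, n], 'kwargs': {}}))
pool.finish_pool_queue()        # waits for the queue to drain, then shuts the workers down
print(pool.get_pool_results())  # unordered list of results collected via the BaseManager proxy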
class MultiprocessingGlycoproteinSiteModelBuildingWorkflow( GlycoproteinSiteModelBuildingWorkflowBase): def __init__(self, analyses, glycopeptide_database, glycan_database, unobserved_penalty_scale=None, lambda_limit=0.2, require_multiple_observations=True, observation_aggregator=None, output_path=None, n_threads=4, q_value_threshold=0.05): super(MultiprocessingGlycoproteinSiteModelBuildingWorkflow, self).__init__(analyses, glycopeptide_database, glycan_database, unobserved_penalty_scale, lambda_limit, require_multiple_observations, observation_aggregator, output_path, q_value_threshold=q_value_threshold) self.builder = None self.input_queue = JoinableQueue(1000) self.output_queue = JoinableQueue(1000) self.input_done_event = Event() self.n_threads = 1 self.n_workers = n_threads self.workers = [] self._has_remote_error = False self.ipc_manager = self.ipc_logger() def prepare_glycoprotein_for_dispatch(self, glycoprotein, builder): prepared = builder.prepare_glycoprotein(glycoprotein) return prepared def feed_queue(self, glycoproteins, builder): n = len(glycoproteins) n_sites = self.count_glycosites(glycoproteins) self.log("Analyzing %d glycoproteins with %d occupied N-glycosites" % (n, n_sites)) i_site = 0 for glycoprotein in glycoproteins: prepared = self.prepare_glycoprotein_for_dispatch( glycoprotein, builder) for work_item in prepared: i_site += 1 self.input_queue.put(work_item) if i_site % 50 == 0 and i_site != 0: self.input_queue.join() self.input_done_event.set() def _handle_local(self, glycoproteins, builder, seen): for glycoprotein in glycoproteins: prepared = self.prepare_glycoprotein_for_dispatch( glycoprotein, builder) for records, site, protein_stub in prepared: key = (protein_stub.name, site) if key in seen: continue else: seen[key] = -1 model = builder.fit_site_model(records, site, protein_stub) if model is not None: self.builder.site_models.append(model) def make_workers(self): for _i in range(self.n_workers): worker = GlycositeModelBuildingProcess( self.builder, self.input_queue, self.output_queue, producer_done_event=self.input_done_event, output_done_event=Event(), log_handler=self.ipc_manager.sender()) self.workers.append(worker) worker.start() def clear_pool(self): for _i, worker in enumerate(self.workers): exitcode = worker.exitcode if exitcode != 0 and exitcode is not None: self.log("... Worker Process %r had exitcode %r" % (worker, exitcode)) try: worker.join(1) except AttributeError: pass if worker.is_alive(): self.debug( "... Worker Process %r is still alive and incomplete" % (worker, )) worker.terminate() def all_workers_finished(self): """Check if all worker processes have finished. """ worker_still_busy = False assert self.workers for worker in self.workers: try: is_done = worker.all_work_done() if not is_done: worker_still_busy = True break except (RemoteError, KeyError) as err: worker_still_busy = True self._has_remote_error = True break return not worker_still_busy def _fit_glycoprotein_site_models(self, glycoproteins, builder): self.builder = builder feeder_thread = Thread(target=self.feed_queue, args=(glycoproteins, builder)) feeder_thread.daemon = True feeder_thread.start() self.make_workers() n_sites = self.count_glycosites(glycoproteins) seen = dict() strikes = 0 start_time = time.time() i = 0 has_work = True while has_work: try: site_model = self.output_queue.get(True, 3) self.output_queue.task_done() key = (site_model.protein_name, site_model.position) seen[(key)] = i if key in seen: self.debug( "...... Duplicate Results For %s. 
First seen at %r, now again at %r" % ( key, seen[key], i, )) else: seen[key] = i i += 1 strikes = 0 if i % 1 == 0: self.log("...... Processed %d sites (%0.2f%%)" % (i, i * 100. / n_sites)) if not isinstance(site_model, EmptySite): self.builder.site_models.append(site_model) except QueueEmptyException: if len(seen) == n_sites: has_work = False # do worker life cycle management here elif self.all_workers_finished(): if len(seen) == n_sites: has_work = False else: strikes += 1 if strikes % 25 == 0: self.log( "...... %d cycles without output (%d/%d, %0.2f%% Done)" % (strikes, len(seen), n_sites, len(seen) * 100. / n_sites)) self.debug("...... Processes") for worker in self.workers: self.debug("......... %r" % (worker, )) self.debug("...... IPC Manager: %r" % (self.ipc_manager, )) if strikes > 1000: self.log( "Too much time has elapsed waiting for final results, finishing locally." ) self._handle_local(glycoproteins, builder, seen) else: strikes += 1 if strikes % 50 == 0: self.log( "...... %d cycles without output (%d/%d, %0.2f%% Done, %d children still alive)" % (strikes, len(seen), n_sites, len(seen) * 100. / n_sites, len(multiprocessing.active_children()) - 1)) try: input_queue_size = self.input_queue.qsize() except Exception: input_queue_size = -1 is_feeder_done = self.input_done_event.is_set() self.log( "...... Input Queue Status: %r. Is Feeder Done? %r" % (input_queue_size, is_feeder_done)) if strikes > 1000: self.log( "Too much time has elapsed waiting for workers, finishing locally." ) self._handle_local(glycoproteins, builder, seen) continue self.clear_pool() self.ipc_manager.stop() feeder_thread.join() dispatcher_end = time.time() self.log("... Dispatcher Finished (%0.3g sec.)" % (dispatcher_end - start_time))
class HadronicRunner2: def __init__(self, m2, q2, Delta, nlf, pdfs, pdfMem, mu02, aS, fs, fp, nProcesses = cpu_count()): # parameters self.m2 = m2 self.q2 = q2 self.Delta = Delta self.nlf = nlf self.pdfs = pdfs self.pdfMem = pdfMem self.mu02 = mu02 self.aS = aS self.fs = fs self.fp = fp self.nProcesses = nProcesses # vars self.__qIn = JoinableQueue() self.__qOut = Queue() self.__js = [] self.__jps = [] self.__ks = [] self.__params = [] self.__paramps = [] self.__data = {} self.__processes = [] # setup x grid def _getGridX(self,Nx): if (Nx < 2): raise "invalid argument! Nx >= 2!" self.__js = range(Nx) self.__ks = range(len(self.fs)) self.__params = [10.**(-4./(Nx-1)*j) for j in self.__js] g = [] for proj in ["G", "L", "P"]: for j in self.__js: for k in self.__ks: g.append({"proj": proj, "j": j, "x": self.__params[j], "k": k, "f": self.fs[k], "res": np.nan}) return g # setup mu2 grid def _getGridMu2(self,x,r,Nmu2,getAlphaS): if (Nmu2 < 2): raise "invalid argument! Nmu2 >= 2!" self.__js = range(Nmu2) self.__ks = range(len(self.fs)) self.__params = [r**(-1.+2./(Nmu2-1)*j) for j in self.__js] g = [] for proj in ["G", "L", "P"]: for j in self.__js: mu2 = self.mu02*self.__params[j] aS = getAlphaS(mu2) for k in self.__ks: g.append({"proj": proj, "x": x, "j": j, "mu2": mu2, "alphaS": aS,"k": k, "f": self.fs[k], "res": np.nan}) return g # setup m2 grid def _getGridM2(self,x,m2min,m2max,Nm2,getMu2,getAlphaS): if (Nm2 < 2): raise "invalid argument! Nm2 >= 2!" self.__js = range(Nm2) self.__ks = range(len(self.fs)) self.__params = [m2min + (m2max-m2min)/(Nm2-1)*j for j in self.__js] g = [] for proj in ["G", "L", "P"]: for j in self.__js: m2 = self.__params[j] mu2 = getMu2(m2) aS = getAlphaS(mu2) for k in self.__ks: g.append({"proj": proj, "x": x, "j": j, "m2": m2, "mu2": mu2, "alphaS": aS, "k": k, "f": self.fs[k], "res": np.nan}) return g # setup muR2-muF2 grid def _getGridMuR2MuF2(self,x,rR,NmuR2,rF,NmuF2,getAlphaS): if (NmuF2 < 2 or NmuR2 < 2): raise "invalid argument! NmuF2 >= 2, NmuR2 >= 2!" self.__js = range(NmuR2) self.__jps = range(NmuF2) self.__ks = range(len(self.fs)) self.__params = [rR**(-1.+2./(NmuR2-1)*j) for j in self.__js] self.__paramps = [rF**(-1.+2./(NmuF2-1)*jp) for jp in self.__jps] g = [] for proj in ["G", "L", "P"]: for j in self.__js: muR2 = self.mu02*self.__params[j] aS = getAlphaS(muR2) for jp in self.__jps: muF2 = self.mu02*self.__paramps[jp] for k in self.__ks: g.append({"proj": proj, "x": x, "j": j, "muR2": muR2, "alphaS": aS, "jp": jp, "muF2": muF2,"k": k, "f": self.fs[k], "res": np.nan}) return g # setup pdf grid def _getGridPdf(self,Nx,proj, pdf, Npdfmem): if (Nx < 2): raise "invalid argument! Nx >= 2!" if (Npdfmem < 0): raise "invalid argument! Npdfmem >= 0!" 
self.__js = range(Nx) self.__jps = range(Npdfmem+1) self.__ks = range(len(self.fs)) self.__params = [10.**(-4./(Nx-1)*j) for j in self.__js] g = [] for j in self.__js: x = self.__params[j] for k in self.__ks: for pdfMem in self.__jps: g.append({"proj": proj, "j": j, "x": x, "k": k, "f": self.fs[k], "pdf": pdf, "pdfMem": pdfMem, "res": np.nan}) return g # start processes def _compute(self,g): self.__qIn = JoinableQueue() # fill for e in g: self.__qIn.put(e) print _pinfo(),"computing %d elements"%self.__qIn.qsize() # add EOF for n in xrange(self.nProcesses): self.__qIn.put(None) self.__qOut = Queue() # start processes oArgs = { "G": (self.m2,self.q2,self.Delta,ElProduction.projT.G,self.nlf,), "L": (self.m2,self.q2,self.Delta,ElProduction.projT.L,self.nlf,), "P": (self.m2,self.q2,self.Delta,ElProduction.projT.P,self.nlf,) } lenParams = len(g) processes = [] threadArgs = (self.__qIn, self.__qOut, oArgs, self.pdfs, self.pdfMem, self.mu02, self.aS, lenParams,) for j in xrange(self.nProcesses): processes.append(Process(target=_threadWorker, args=threadArgs)) [p.start() for p in processes] # run try: self.__qIn.join() except KeyboardInterrupt: [p.terminate() for p in processes] print "\n",_pwarn(),"aborting at",self.__qOut.qsize(),"/",lenParams self.__qIn.close() sys.stdout.write("\n") # reorder in 1D def _reorder1(self): self.__data = {} self.__data["G"] = [[np.nan for k in self.__ks] for j in self.__js] self.__data["L"] = [[np.nan for k in self.__ks] for j in self.__js] self.__data["P"] = [[np.nan for k in self.__ks] for j in self.__js] l = self.__qOut.qsize() while g in range(l): p = self.__qOut.get() self.__data[p["proj"]][p["j"]][p["k"]] = p["res"] # reorder in 2D def _reorder2(self): self.__data = {} self.__data["G"] = [[[np.nan for k in self.__ks] for jp in self.__jps] for j in self.__js] self.__data["L"] = [[[np.nan for k in self.__ks] for jp in self.__jps] for j in self.__js] self.__data["P"] = [[[np.nan for k in self.__ks] for jp in self.__jps] for j in self.__js] l = self.__qOut.qsize() for g in range(l): p = self.__qOut.get() self.__data[p["proj"]][p["j"]][p["jp"]][p["k"]] = p["res"] # reorder in pdf data def _reorderPdf(self): self.__data = {} self.__data["G"] = [[[np.nan for pdfMem in self.__jps] for k in self.__ks] for j in self.__js] self.__data["L"] = [[[np.nan for pdfMem in self.__jps] for k in self.__ks] for j in self.__js] self.__data["P"] = [[[np.nan for pdfMem in self.__jps] for k in self.__ks] for j in self.__js] l = self.__qOut.qsize() for g in range(l): p = self.__qOut.get() self.__data[p["proj"]][p["j"]][p["k"]][p["pdfMem"]] = p["res"] # write data for 1D def _write1(self): with open(self.fp, "w") as f: for j in self.__js: x = self.__params[j] l = ["%e"%x] data2 = [self.__data["G"][j][k]+self.__data["L"][j][k]*3./2. for k in self.__ks] for k in self.__ks: l.append("%e"%data2[k]) for k in self.__ks: l.append("%e"%self.__data["L"][j][k]) for k in self.__ks: l.append("%e"%self.__data["P"][j][k]) f.write("\t".join(l)+"\n") # write data for 2D def _write2(self): with open(self.fp, "w") as f: for j in self.__js: x = self.__params[j] for jp in self.__jps: xp = self.__paramps[jp] l = ["%e"%x, "%e"%xp] data2 = [self.__data["G"][j][jp][k]+self.__data["L"][j][jp][k]*3./2. 
for k in self.__ks] for k in self.__ks: l.append("%e"%data2[k]) for k in self.__ks: l.append("%e"%self.__data["L"][j][jp][k]) for k in self.__ks: l.append("%e"%self.__data["P"][j][jp][k]) f.write("\t".join(l)+"\n") f.write("\n") # write data for pdf data def _writePdf(self,proj): with open(self.fp, "w") as f: for j in self.__js: x = self.__params[j] l = ["%e"%x] d = self.__data[proj][j] for k in self.__ks: l.append("%e"%np.min(d[k])) l.append("%e"%(d[k][0])) l.append("%e"%np.max(d[k])) f.write("\t".join(l)+"\n") # compute grid in 1D def _run1(self,g): if len(g) == 0: print _pwarn(),"no data!" return self._compute(g) self._reorder1() self._write1() # compute grid in 2D def _run2(self,g): if len(g) == 0: print _pwarn(),"no data!" return self._compute(g) self._reorder2() self._write2() # iterate x def runX(self,Nx): self._run1(self._getGridX(Nx)) # iterate mu2 def runMu2(self,x,r,Nmu2,getAlphaS): self._run1(self._getGridMu2(x,r,Nmu2,getAlphaS)) # iterate m2 def runM2(self,x,m2min,m2max,Nm2,getMu2,getAlphaS): self._run1(self._getGridM2(x,m2min,m2max,Nm2,getMu2,getAlphaS)) # iterate muR2 and muF2 def runMuR2MuF2(self,x,rR,NmuR2,rF,NmuF2,getAlphaS): self._run2(self._getGridMuR2MuF2(x,rR,NmuR2,rF,NmuF2,getAlphaS)) def runPdf(self,Nx,proj, pdf, Npdfmem): g = self._getGridPdf(Nx,proj, pdf, Npdfmem) if len(g) == 0: print _pwarn(),"no data!" return self._compute(g) self._reorderPdf() self._writePdf(proj)
def _parallely_make_dataset(self): #Get video_list and video_length files if they exist name_file = "{}/video_list.npy".format(self.loc) len_file = "{}/video_lengths.npy".format(self.loc) if os.path.isfile(name_file): video_list = np.load(name_file) video_lengths = np.load(len_file) return video_list, video_lengths #Files don't yet exist, so create them q = Queue() qvideo_list = Queue() #Collect list of videos to index fnames_list = [] if self.split_file != None: with open(self.split_file, 'r') as f: line = f.readline() while line: fnames_list.append( self.line_to_fname( os.path.join(self.data_dir, line.strip()))) line = f.readline() else: for root, _, fnames in tqdm(os.walk(self.data_dir)): for fname in sorted(fnames): fnames_list.append(os.path.join(root, fname)) #Truncate list if necessary if self.limit is not None: fnames_list = fnames_list[:self.limit] #Parallely open videos, get length def parallel_worker(fnames_chunk): item = q.get() for fname in tqdm(fnames_chunk): if has_file_allowed_extension(fname, VIDEO_EXTENSION): video_path = fname vc = cv2.VideoCapture(video_path) length = int(vc.get(cv2.CAP_PROP_FRAME_COUNT)) if length > 0 and vc.isOpened(): qvideo_list.put((video_path, length)) qvideo_list.task_done() vc.release() q.task_done() processes = self.parallel_processes if self.limit is not None and processes >= self.limit: processes = self.limit n = len(fnames_list) chunk = int(n / processes) if chunk == 0: chunk = 1 fnames_chunks = [fnames_list[i*chunk:(i+1)*chunk] \ for i in range((n + chunk - 1) // chunk)] for i in range(processes): q.put(i) multiprocessing.Process(target=parallel_worker, args=(fnames_chunks[i], )).start() q.join() qvideo_list.join() video_list = [] video_lengths = [] while qvideo_list.qsize() != 0: video, length = qvideo_list.get() video_list.append(video) video_lengths.append(length) np.save(name_file, video_list) np.save(len_file, video_lengths) return video_list, video_lengths
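# Hedged side sketch: the int(n / processes) chunking above can produce more chunks
# than started workers when len(fnames_list) is not evenly divisible, leaving trailing
# chunks unconsumed. np.array_split always yields exactly n_chunks pieces; numpy is
# assumed here, as it is already used by this snippet.
import numpy as np

def split_into_chunks(items, n_chunks):
    # np.array_split returns exactly n_chunks (possibly uneven) pieces.
    return [list(chunk) for chunk in np.array_split(np.array(items, dtype=object), n_chunks)]

# e.g. 10 paths over 4 workers -> chunk sizes [3, 3, 2, 2]
print([len(c) for c in split_into_chunks(['clip_%d.mp4' % i for i in range(10)], 4)])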
class FindText(BaseWorkerCustomer): NUM_WORKING_PROCESSES = 2 def __init__(self, params, *args, **kwargs): super(FindText, self).__init__(*args, **kwargs) self.path = params.get("path", "/") self.text = params.get("text", "") self.params = params # file queue to be processed by many threads self.file_queue = JoinableQueue(maxsize=0) self.result_queue = Queue(maxsize=0) self.result = [] self.is_alive = {"status": True} self.re_text = re.compile(".*" + fnmatch.translate(self.text)[:-7] + ".*", re.UNICODE | re.IGNORECASE) # remove \Z(?ms) from end of result expression def run(self): try: self.preload() except Exception as e: result = {"error": True, "message": str(e), "traceback": traceback.format_exc()} self.on_error(self.status_id, result, pid=self.pid, pname=self.name) return def worker(re_text, file_queue, result_queue, logger, timeout): while int(time.time()) < timeout: if file_queue.empty() is not True: f_path = file_queue.get() try: if not is_binary(f_path): mime = mimetypes.guess_type(f_path)[0] # исключаем некоторые mime типы из поиска if mime not in ["application/pdf", "application/rar"]: with open(f_path, "rb") as fp: for line in fp: try: line = as_unicode(line) except UnicodeDecodeError: charset = chardet.detect(line) if charset.get("encoding") in ["MacCyrillic"]: detected = "windows-1251" else: detected = charset.get("encoding") if detected is None: break try: line = str(line, detected, "replace") except LookupError: pass if re_text.match(line) is not None: result_queue.put(f_path) # logger.debug("matched file = %s " % f_path) break except UnicodeDecodeError as unicode_e: logger.error("UnicodeDecodeError %s, %s" % (str(unicode_e), traceback.format_exc())) except IOError as io_e: logger.error("IOError %s, %s" % (str(io_e), traceback.format_exc())) except Exception as other_e: logger.error("Exception %s, %s" % (str(other_e), traceback.format_exc())) finally: file_queue.task_done() else: time.sleep(REQUEST_DELAY) try: self.logger.debug("findText started with timeout = %s" % TIMEOUT_LIMIT) time_limit = int(time.time()) + TIMEOUT_LIMIT # Launches a number of worker threads to perform operations using the queue of inputs for i in range(self.NUM_WORKING_PROCESSES): p = Process( target=worker, args=(self.re_text, self.file_queue, self.result_queue, self.logger, time_limit) ) p.start() proc = psutil.Process(p.pid) proc.ionice(psutil.IOPRIO_CLASS_IDLE) proc.nice(20) self.logger.debug( "Search worker #%s, set ionice = idle and nice = 20 for pid %s" % (str(i), str(p.pid)) ) self.processes.append(p) abs_path = self.get_abs_path(self.path) self.logger.debug("FM FindText worker run(), abs_path = %s" % abs_path) if not os.path.exists(abs_path): raise Exception("Provided path not exist") self.on_running(self.status_id, pid=self.pid, pname=self.name) for current, dirs, files in os.walk(abs_path): for f in files: try: file_path = os.path.join(current, f) self.file_queue.put(file_path) except UnicodeDecodeError as e: self.logger.error("UnicodeDecodeError %s, %s" % (str(e), traceback.format_exc())) except IOError as e: self.logger.error("IOError %s, %s" % (str(e), traceback.format_exc())) except Exception as e: self.logger.error("Exception %s, %s" % (str(e), traceback.format_exc())) while int(time.time()) <= time_limit: self.logger.debug( "file_queue size = %s , empty = %s (timeout: %s/%s)" % (self.file_queue.qsize(), self.file_queue.empty(), str(int(time.time())), time_limit) ) if self.file_queue.empty(): self.logger.debug("join() file_queue until workers done jobs") self.file_queue.join() break 
else: time.sleep(REQUEST_DELAY) if int(time.time()) > time_limit: self.is_alive["status"] = False for p in self.processes: try: self.logger.debug("FM FindText terminate worker process, pid = %s" % p.pid) kill(p.pid, signal.SIGKILL, self.logger) except OSError: self.logger.error("FindText unable to terminate worker process, pid = %s" % p.pid) if self.is_alive["status"] is True: while not self.result_queue.empty(): file_path = self.result_queue.get() self.result.append(self._make_file_info(file_path)) self.on_success(self.status_id, data=self.result, pid=self.pid, pname=self.name) else: result = {"error": True, "message": "Operation timeout exceeded", "traceback": ""} self.on_error(self.status_id, result, pid=self.pid, pname=self.name) except Exception as e: result = {"error": True, "message": str(e), "traceback": traceback.format_exc()} self.on_error(self.status_id, result, pid=self.pid, pname=self.name)
def search(output_dict, rules_file): rules = [rule.split(' | ') for rule in pickle.load(open(rules_file, 'rb'))] file_list = JoinableQueue() word_dict = Manager().dict() for root, subFolders, files in os.walk(os.path.join(os.path.dirname(__file__), 'corpus', 'tagged')): for current_file in files: if current_file.endswith(".pickle"): file_list.put(os.path.join(root, current_file)) #break # TODO remove (only for testing with one file) file_count = file_list.qsize() def worker(): def rule_parser(tagged_data): parser = nltk.RegexpParser(''' NP: {<NN|NNS|NNP|NNPS|NE>} NPs: {<NP> (<,|CC> <NP>)+} ''') return parser.parse(tagged_data) def get_nltk_word(data): if isinstance(data, nltk.tree.Tree): if isinstance(data[0], tuple): return data[0][0] else: return data[0] else: return data[0] def add_to_dict(hypernym, hyponym): if not hyponym in word_dict.keys(): old_list = word_dict.get(hypernym) if not old_list: old_list = [hyponym] else: if not hyponym in old_list: old_list.append(hyponym) word_dict[hypernym] = old_list def apply_rules(data, position): for rule in rules: # search right side if rule[0] == 'HYPERNYM': possible_hypernym = get_nltk_word(data[position]) error = False word_count = 1 for word in rule[1:-1]: try: if word != get_nltk_word(data[position + word_count]): error = True word_count += 1 except IndexError: pass try: if not error: if isinstance(data[position + word_count], nltk.tree.Tree): if data[position + word_count].node == 'NP' and rule[-1] == 'NP': add_to_dict(possible_hypernym, data[position + word_count][0][0]) break elif data[position + word_count].node == 'NPs' and rule[-1] == 'NPs': for node in data[position + word_count]: if isinstance(node, nltk.tree.Tree): add_to_dict(possible_hypernym, node[0][0]) break except IndexError: pass # search left side elif rule[-1] == 'HYPERNYM': possible_hypernym = get_nltk_word(data[position]) error = False word_count = -1 nrule = list(rule) nrule.reverse() for word in nrule[1:-1]: try: if word != get_nltk_word(data[position + word_count]): error = False word_count -= 1 except IndexError: pass try: if not error: if isinstance(data[position + word_count], nltk.tree.Tree): if data[position + word_count].node == 'NP' and rule[-1] == 'NP': add_to_dict(possible_hypernym, data[position + word_count][0][0]) break elif data[position + word_count].node == 'NPs' and rule[-1] == 'NPs': for node in data[position + word_count]: if isinstance(node, nltk.tree.Tree): add_to_dict(possible_hypernym, node[0][0]) break except IndexError: pass while not file_list.empty(): input_file = file_list.get() tagged_data = rule_parser(pickle.load(open(input_file, 'rb'))) for n in range(len(tagged_data)): if isinstance(tagged_data[n], nltk.tree.Tree): if tagged_data[n].node == 'NP': apply_rules(tagged_data, n) percentage = 100.0 - ((float(file_list.qsize()) / float(file_count)) * 100.0) sys.stdout.write("\rProgress: {0:.2f}%".format(percentage)) sys.stdout.flush() file_list.task_done() sys.stdout.write("\rProgress: 0.00%") for pid in range(8): process = Process(target=worker, args=()) process.daemon = True process.start() file_list.join() print('') pickle_dict = dict() for key in word_dict.keys(): pickle_dict[key] = word_dict.get(key) pickle.dump(pickle_dict, open(output_dict, 'wb+'), 2)
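# Hedged alternative sketch: testing file_list.empty() and then calling get(), as the
# worker above does, is racy once several processes consume the same queue. A
# non-blocking get avoids the race; handle_item is a hypothetical callback standing in
# for the parsing done above.
import queue  # only for the Empty exception; the work queue stays a multiprocessing JoinableQueue

def drain(file_list, handle_item):
    while True:
        try:
            input_file = file_list.get_nowait()
        except queue.Empty:
            break  # no more work for this process
        try:
            handle_item(input_file)
        finally:
            file_list.task_done()  # always balance the earlier put() so join() can return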
class FindText(BaseWorkerCustomer): NUM_WORKING_PROCESSES = 2 def __init__(self, params, session, *args, **kwargs): super(FindText, self).__init__(*args, **kwargs) self.path = params.get('path', '/') self.session = session self.session = session self.text = params.get('text', '') self.params = params # file queue to be processed by many threads self.file_queue = JoinableQueue(maxsize=0) self.result_queue = Queue(maxsize=0) self.result = [] self.is_alive = { "status": True } self.re_text = re.compile('.*' + fnmatch.translate(self.text)[:-7] + '.*', re.UNICODE | re.IGNORECASE) # remove \Z(?ms) from end of result expression def run(self): try: self.preload() sftp = self.get_sftp_connection(self.session) self.logger.debug("findText started with timeout = %s" % TIMEOUT_LIMIT) time_limit = int(time.time()) + TIMEOUT_LIMIT # Launches a number of worker threads to perform operations using the queue of inputs sftp_managers = [] for i in range(self.NUM_WORKING_PROCESSES): p = Process(target=self.worker, args=(self.re_text, self.file_queue, self.result_queue, time_limit)) p.start() proc = psutil.Process(p.pid) proc.ionice(psutil.IOPRIO_CLASS_IDLE) proc.nice(20) self.logger.debug( "Search worker #%s, set ionice = idle and nice = 20 for pid %s" % ( str(i), str(p.pid))) self.processes.append(p) abs_path = self.path self.logger.debug("FM FindText worker run(), abs_path = %s" % abs_path) if not sftp.exists(abs_path): raise Exception("Provided path not exist") self.on_running(self.status_id, pid=self.pid, pname=self.name) for current, dirs, files in sftp.walk(abs_path): for f in files: try: file_path = os.path.join(current, f) self.file_queue.put(file_path) except UnicodeDecodeError as e: self.logger.error( "UnicodeDecodeError %s, %s" % (str(e), traceback.format_exc())) except IOError as e: self.logger.error("IOError %s, %s" % (str(e), traceback.format_exc())) except Exception as e: self.logger.error( "Exception %s, %s" % (str(e), traceback.format_exc())) while int(time.time()) <= time_limit: self.logger.debug("file_queue size = %s , empty = %s (timeout: %s/%s)" % ( self.file_queue.qsize(), self.file_queue.empty(), str(int(time.time())), time_limit)) if self.file_queue.empty(): self.logger.debug("join() file_queue until workers done jobs") self.file_queue.join() break else: time.sleep(REQUEST_DELAY) if int(time.time()) > time_limit: self.is_alive['status'] = False for sftp in sftp_managers: sftp.conn.close() for p in self.processes: try: self.logger.debug("FM FindText terminate worker process, pid = %s" % p.pid) kill(p.pid, signal.SIGKILL, self.logger) except OSError: self.logger.error( "FindText unable to terminate worker process, pid = %s" % p.pid) if self.is_alive['status'] is True: while not self.result_queue.empty(): file_path = self.result_queue.get() self.result.append(sftp.make_file_info(file_path)) self.on_success(self.status_id, data=self.result, pid=self.pid, pname=self.name) else: result = { "error": True, "message": "Operation timeout exceeded", "traceback": "" } self.on_error(self.status_id, result, pid=self.pid, pname=self.name) except Exception as e: result = { "error": True, "message": str(e), "traceback": traceback.format_exc() } self.on_error(self.status_id, result, pid=self.pid, pname=self.name) def worker(self, re_text, file_queue, result_queue, timeout): try: worker_sftp = self.get_sftp_connection(self.session) while int(time.time()) < timeout: if file_queue.empty() is not True: f_path = file_queue.get() try: if not worker_sftp.is_binary(f_path): mime = mimetypes.guess_type(f_path)[0] 
                        # exclude some mime types from the search
                        if mime not in ['application/pdf', 'application/rar']:
                            with worker_sftp.open(f_path, 'rb') as fp:
                                for line in fp:
                                    try:
                                        line = as_unicode(line)
                                    except UnicodeDecodeError:
                                        charset = chardet.detect(line)
                                        if charset.get('encoding') in ['MacCyrillic']:
                                            detected = 'windows-1251'
                                        else:
                                            detected = charset.get('encoding')
                                        if detected is None:
                                            break
                                        try:
                                            line = str(line, detected, "replace")
                                        except LookupError:
                                            pass
                                    if re_text.match(line) is not None:
                                        result_queue.put(f_path)
                                        self.logger.debug("matched file = %s " % f_path)
                                        break
                except UnicodeDecodeError as unicode_e:
                    self.logger.error(
                        "UnicodeDecodeError %s, %s" % (str(unicode_e), traceback.format_exc()))
                except IOError as io_e:
                    self.logger.error("IOError %s, %s" % (str(io_e), traceback.format_exc()))
                except Exception as other_e:
                    self.logger.error("Exception %s, %s" % (str(other_e), traceback.format_exc()))
                finally:
                    file_queue.task_done()
            else:
                time.sleep(REQUEST_DELAY)
        worker_sftp.close()
    except Exception as e:
        result = {
            "error": True,
            "message": str(e),
            "traceback": traceback.format_exc()
        }
        self.logger.error('SFTP FindText Worker Exception {}'.format(result))
def optimized_compute(self): """ First process: it computes the quilt algorithm with big tiles, manages child processes and them combines the results. 1) creates the child processes (number defined according to the available cores and the number of big tiles in the image) 2) computes quilting with big tiles 3) every time a tile is computed (and sewed with the image), it is put in a queue process 1: big tiles for each of the tile: process n """ self.log.info('\nMULTIPROCESSING COMPUTING ...') big_num_tiles = self.calc_num_tiles(tile_size=self.big_tilesize, overlap=self.big_overlap) # prepare the pool n_proc = min(big_num_tiles[0] * big_num_tiles[1], self.cores) out_queue = Queue() in_queue = JoinableQueue() pool = Pool(n_proc, unwrap_self, ( self, in_queue, out_queue, )) self.log.info('preparing {0} processes - {1}'.format( n_proc, time.strftime("%H:%M:%S"))) if self.Ymask is not None: # zero values will become inf Ymask_rgb = gray2rgb(self.Ymask) # use the mask as a draft of the dst img so that boundaries are # respected self.Y[0] = deepcopy(Ymask_rgb) for i in xrange(big_num_tiles[0]): startI = i * self.big_tilesize - i * self.big_overlap endI = min(self.Y[0].shape[0], startI + self.big_tilesize) sizeI = endI - startI if sizeI <= self.overlap: continue for j in xrange(big_num_tiles[1]): startJ = j * self.big_tilesize - j * self.big_overlap endJ = min(self.Y[0].shape[1], startJ + self.big_tilesize) sizeJ = endJ - startJ if sizeJ <= self.overlap: continue dst_patches = [y[startI:endI, startJ:endJ, :] for y in self.Y] # for the big tiles don't consider the mask, since it would # remove most of the image because the tiles are so big res_patches = self._compute_patch( dst_patches, [sizeI, sizeJ], (i, j), mask=self.Xmask_big, constraint_start=self.constraint_start, err=0.8) # add the mask on top if self.Ymask is not None: res_patches = [ r * Ymask_rgb[startI:endI, startJ:endJ] for r in res_patches ] for idx, res in enumerate(res_patches): self.Y[idx][startI:endI, startJ:endJ, :] = res # make a process start in this big tile _img = [y[startI:endI, startJ:endJ, :] for y in self.Y] _mask = self.Ymask[startI:endI, startJ:endJ] \ if self.Ymask is not None else None _id = (startI, startJ) in_queue.put({'dst': _img, 'mask': _mask, 'id': _id}) # wait for all the children self.log.debug('master finished {0}'.format(time.strftime("%H:%M:%S"))) show(self.Y[0]) if self.debug else None pool.close() self.log.debug('closed, in queue: {0} out: {1}'.format( in_queue.qsize(), out_queue.qsize())) in_queue.join() self.log.debug('all children finished {0}'.format( time.strftime("%H:%M:%S"))) # get the results results = sorted([ out_queue.get() for _ in xrange(big_num_tiles[0] * big_num_tiles[1]) ]) # sew them together for idx, res in results: # calculate the mask base_patch = self.Y[0][idx[0]:idx[0] + self.big_tilesize, idx[1]:idx[1] + self.big_tilesize] new_patch = res[0] mask_patch = self.calc_patch_mask(base_patch, new_patch, coord=idx, overlap=self.big_overlap) # apply the mask to each layer for i, y in enumerate(self.Y): base_patch = y[idx[0]:idx[0] + self.big_tilesize, idx[1]:idx[1] + self.big_tilesize] new_patch = res[i] self.Y[i][idx[0]:idx[0]+self.big_tilesize, idx[1]:idx[1]+self.big_tilesize, :] = \ filter_img(new_patch, base_patch, mask_patch) # apply the mask again if self.Ymask is not None: self.Y = [r * Ymask_rgb for r in self.Y] show(self.Y[0]) if self.debug else None if self.result_path: save(self.Y[0], self.result_path) self.log.info('saving' + self.result_path)
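# A minimal, self-contained sketch of the master/worker pattern optimized_compute()
# relies on: Pool workers run an initializer that loops over a JoinableQueue of tiles
# and push results to a plain Queue. tile_worker and process_tile are illustrative
# names only, not the functions used above.
from multiprocessing import Pool, Queue, JoinableQueue

def process_tile(tile):
    return tile * 2  # stand-in for the real patch computation

def tile_worker(in_queue, out_queue):
    while True:
        tile = in_queue.get()
        try:
            if tile is None:
                return  # sentinel: this worker is done
            out_queue.put(process_tile(tile))
        finally:
            in_queue.task_done()

if __name__ == '__main__':
    in_q, out_q = JoinableQueue(), Queue()
    n_proc, n_tiles = 4, 8
    pool = Pool(n_proc, tile_worker, (in_q, out_q))
    for t in range(n_tiles):
        in_q.put(t)
    for _ in range(n_proc):
        in_q.put(None)          # one sentinel per worker
    in_q.join()                 # wait until every tile (and sentinel) is task_done()
    print(sorted(out_q.get() for _ in range(n_tiles)))
    pool.close()
    pool.join()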
def learning(input_dict, output_rules): def worker(): def rule_parser(tagged_data): parser = nltk.RegexpParser(''' NP: {<NN|NNS|NNP|NNPS|NE>} NPs: {<NP> (<,|CC> <NP>)+} ''') return parser.parse(tagged_data) def find_hypernyms(pre_parsed_data, hypernym_list): hypernym_positions = [] for n in range(len(pre_parsed_data)): if isinstance(pre_parsed_data[n], nltk.tree.Tree): if pre_parsed_data[n].node == 'NP': if pre_parsed_data[n][0][0] in hypernym_list: hypernym_positions.append((n, pre_parsed_data[n][0][0])) return hypernym_positions def find_pattern(pre_parsed_data, hypernym, hypernym_list): left = [] right = [] start_pos = hypernym[0] def add_to_list(current_list, element, position, has_hyponym): try: if isinstance(element[position], nltk.tree.Tree): current_list.append((element[position][0][0], element[position].node)) if element[position].node == 'NP': if element[position][0][0] in hypernym_list[hypernym[1]]: has_hyponym.append(len(current_list)) elif element[position].node == 'NPs': for possible_np in element[position][0]: if isinstance(possible_np, nltk.tree.Tree): if possible_np[0][0] in hypernym_list[hypernym[1]]: has_hyponym.append(len(current_list)) else: current_list.append((element[position][0], element[position][1])) except IndexError: pass # search the right side for a pattern has_hyponym = [] for i in range(start_pos, start_pos + MAX_SEARCH_RANGE, 1): add_to_list(right, pre_parsed_data, i, has_hyponym) if has_hyponym: return 'right', right[:has_hyponym[-1]] has_hyponym = [] # search the left side for a pattern for i in range(start_pos, start_pos - MAX_SEARCH_RANGE, -1): add_to_list(left, pre_parsed_data, i, has_hyponym) if has_hyponym: left.reverse() return 'left', left[(MAX_SEARCH_RANGE - has_hyponym[-1]):] return None def add_rule(rule): if len(rule[1]) >= 3: rts = [] count = 0 for element in rule[1]: if element[1] == 'NPs' or element[1] == 'NP': if count == 0 or count == len(rule[1]) - 1: rts.append(element[1]) else: if element[1] == 'NP': rts.append(element[0]) else: for node in element[1]: rts.append(node[0]) else: rts.append(element[0]) count += 1 if rule[0] == 'left': rts[len(rule[1]) - 1] = 'HYPERNYM' else: rts[0] = 'HYPERNYM' rts_str = ' | '.join(rts) if rts_str in rules: rules[rts_str] += 1 else: rules[rts_str] = 1 while not file_list.empty(): input_file = file_list.get() tagged_data = pickle.load(open(input_file, 'rb')) pre_parsed_data = rule_parser(tagged_data) hypernym_positions = find_hypernyms(pre_parsed_data, h_dict.keys()) for hypernym in hypernym_positions: rule = find_pattern(pre_parsed_data, hypernym, h_dict) if rule: add_rule(rule) percentage = 100.0 - ((float(file_list.qsize()) / float(file_count)) * 100.0) sys.stdout.write("\rProgress: {0:.2f}%".format(percentage)) sys.stdout.flush() file_list.task_done() def blacklist_filter(rule): result = True number_blacklist = [] for i in range(100): number_blacklist.append(num2words(i).encode('ascii')) for word in (WORD_BLACKLIST + number_blacklist): if word in rule: result = False return result h_dict = pickle.load(open(input_dict, 'rb')) rules = Manager().dict() file_list = JoinableQueue() sys.stdout.write("\rProgress: 0.00%") for root, subFolders, files in os.walk(os.path.join(os.path.dirname(__file__), 'corpus', 'tagged')): for current_file in files: if current_file.endswith(".pickle"): file_list.put(os.path.join(root, current_file)) file_count = file_list.qsize() for pid in range(8): process = Process(target=worker, args=()) process.daemon = True process.start() file_list.join() print('') # filter the rules and 
    # save them
    filtered_rules = {k: v for k, v in rules.items()
                      if v >= THRESHOLD and blacklist_filter(k)}
    rule_list = sorted(filtered_rules.keys(), key=filtered_rules.get)
    rule_list.reverse()

    pickle.dump(rule_list, open(output_rules, 'wb+'), 2)
self.outputqueue.put( self.compute(n)) if __name__ == '__main__': begintime = time.time() inputqueue = Queue() outputqueue = Queue() threads = int(sys.argv[1]) feeder = Feeder(inputqueue,outputqueue, threads) feeder.start() duration = time.time() - begintime print("Feeder started (" + str(duration) + "s)") for _ in range(0,threads): processor = Processor(inputqueue,outputqueue) processor.start() inputqueue.join() duration = time.time() - begintime print("Inputqueue done (" + str(duration) + "s)") outputqueue.put(None) print("Outputqueue length (" + str(outputqueue.qsize()) + ")") feeder.join() duration = time.time() - begintime print("Outputqueue done (" + str(duration) + "s)")
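# Hedged sketch of Feeder/Processor shapes that would satisfy the main block above;
# their real definitions are not shown here. Note that inputqueue.join() only exists on
# a JoinableQueue, so Queue is assumed to be (or alias) multiprocessing.JoinableQueue.
from multiprocessing import Process, JoinableQueue

class Feeder(Process):
    def __init__(self, inputqueue, outputqueue, threads):
        super().__init__()
        self.inputqueue, self.outputqueue, self.threads = inputqueue, outputqueue, threads

    def run(self):
        for n in range(100):           # illustrative workload
            self.inputqueue.put(n)
        for _ in range(self.threads):  # one stop sentinel per Processor
            self.inputqueue.put(None)

class Processor(Process):
    def __init__(self, inputqueue, outputqueue):
        super().__init__()
        self.inputqueue, self.outputqueue = inputqueue, outputqueue

    def compute(self, n):
        return n * n                   # stand-in for the real computation

    def run(self):
        while True:
            n = self.inputqueue.get()
            try:
                if n is None:
                    break
                self.outputqueue.put(self.compute(n))
            finally:
                self.inputqueue.task_done()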
return [*file_hashes] if __name__ == '__main__': freeze_support() print(perf_counter()) with EXECUTOR(max_workers=MAX_WORKERS) as executor: in_dirs = 0 out_dirs = 1 in_list = list() out_list = [ basepath, ] hash_count = 0 sleeps = 0 while not q.empty() and q.qsize(): item = q.get() print(f'{in_dirs}: {item}') in_list.append(item) in_dirs += 1 future_dirs = executor.submit(dir_worker, item) dirs = future_dirs.result() q_dirs = [*map(q.put, dirs)] #print(dirs) out_dirs += len(dirs) out_list.extend(dirs) future_hashes = executor.submit(hash_worker, item) q_hashes = [*map(hash_q.put, dirs)] hashes = future_hashes.result() print(hashes) hash_count += len(hashes)
import nltk
import os
import sys
import re
import pickle
from multiprocessing import JoinableQueue  # needed for file_list below

nltk.data.path.append(os.path.join(os.path.dirname(__file__), 'nltk'))

file_list = JoinableQueue()
for root, subFolders, files in os.walk(os.path.join(os.path.dirname(__file__), 'corpus', 'plain')):
    for current_file in files:
        if current_file.endswith(".txt"):
            file_list.put(os.path.join(root, current_file))
file_count = file_list.qsize()


def worker():
    while not file_list.empty():
        input_file = file_list.get()
        with open(input_file, 'r') as c_file:
            contents = c_file.read()
        # collapse runs of whitespace
        contents = re.sub(r'\s{2,}', ' ', contents)
        tokens = [word for sent in nltk.sent_tokenize(contents)
                  for word in nltk.word_tokenize(sent)]
        pos_tagged = nltk.pos_tag(tokens)
        pos_tagged = nltk.ne_chunk(pos_tagged)
        file_list.task_done()  # balance the put() so file_list.join() can return
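# Hedged continuation sketch: the fragment above is truncated before the workers are
# launched; the sibling workers in this document start eight daemon processes and then
# join the queue, so this one would presumably be started the same way.
from multiprocessing import Process

for _pid in range(8):
    _p = Process(target=worker)
    _p.daemon = True
    _p.start()
file_list.join()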
shape = image.shape f = h5py.File('/dev/shm/test.h5','w') data = f.create_dataset('data', (len(minc_volumes),) + image.shape, chunks=(CHUNKSIZE,) + image.shape, compression='gzip') names = f.create_dataset('names', (len(minc_volumes),), dtype=h5py.special_dtype(vlen=unicode)) vf.close() q = JoinableQueue() p = Process(target=enqueue_volumes, args=(q,minc_volumes)) p.start() while True: item = q.get(block=True) q.task_done() if item is None: break i, j, zeros, namelist = item print 'data[{}:{},:] = zeros[:{},:]'.format(i, i+j+1, j+1) data[i:i+j+1,:] = zeros[:j+1,:] print 'names[{}:{}] = namelist'.format(i, i+j+1) print "qsize:", q.qsize() names[i:i+j+1] = namelist f.close()
class ProcessPool: def __init__(self, max_number_of_processs: int, process_name_prifix: str = ''): self.logger = logging.getLogger(__name__) self._queue = JoinableQueue() self._res_queue = Queue() self._process_pool = [] self.output = [] self._lock = Lock() self._processes_count = max_number_of_processs for i in range(max_number_of_processs): self._process_pool.append( self._create_pr('{}{:02d}'.format(process_name_prifix, i))) def _create_pr(self, th_name) -> WorkerProcess: t = WorkerProcess(self._queue, self._res_queue, self._lock, name=th_name, status_log_interval=self._processes_count * 20) t.daemon = True t.start() return t @property def queue(self): return self._queue def kill_them_all(self, timeout=5): logger = logging.getLogger(__name__) logger.debug('closing all processs') for t in self._process_pool: # I'm putting none to push queue out of block self._queue.put((None, None)) logger.debug('collecting all output data from processs') self._res_queue.put(None) if sys.platform == "linux" or sys.platform == "linux2" or\ sys.platform == "win32": output_count = self._res_queue.qsize() else: output_count = None # result = self._res_queue.get() collected = 0 number_of_none_outputs = 0 none_indeces = [] while True: try: result = self._res_queue.get_nowait() collected += 1 except queue.Empty: if output_count is not None: if collected < output_count: continue else: break else: break if result is None: number_of_none_outputs += 1 none_indeces.append(collected - 1) else: self.output.append(result) logger.info('from all {} ouputs {} were collected {} ' 'of them were None. None indeces: {}'.format( output_count, len(self.output), number_of_none_outputs, none_indeces)) logger.debug('data were collected waiting for processses to join') for t in self._process_pool: t.join(timeout) logger.debug('Processses joined successfully - now closing them all') for t in self._process_pool: try: t.close() except ValueError as err: logger.error(err, exc_info=True) logger.info('Closing the process was not seuccessful.' ' I will terminate it') t.terminate() logger.debug('processs all closed') def get_status(self) -> str: stat = '' input_qsz = self._queue.qsize() output_qsz = self._res_queue.qsize() stat += ("{} processes with inputs ({}) and outputs({})\n".format( len(self._process_pool), input_qsz, output_qsz)) for ps in self._process_pool: name = ps.name st = 'alive' if ps.is_alive() else 'dead' stat += ('\t\tps {} is {}\n'.format(name, st)) return stat
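# Hedged helper expressing the platform check above: qsize() raises NotImplementedError
# on macOS, where sem_getvalue() is unavailable, hence the linux/win32 special case.
def safe_qsize(q):
    try:
        return q.qsize()
    except NotImplementedError:
        return None  # e.g. macOS; the caller must fall back to draining until Empty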
def bulk(bucket_name, prefix, concurrency, glob_load, resume_load, dest_dataset=None, alias=None, cluster_by=(), drop=(), rename={}, replace=()): """ Load data into BigQuery concurrently Args: bucket_name: gcs bucket name (str) prefix: object key path, 'dataset/version' (str) concurrency: number of processes to handle the load (int) glob_load: load data by globbing path dirs (boolean) resume_load: resume load (boolean) dest_dataset: override default dataset location (str) alias: override object key derived table name (str) cluster_by: top level fields to cluster by (Tuple[str]) drop: top level fields to exclude (Tuple[str]) rename: top level fields to rename (Dict[str,str]) replace: top field replacement expressions (Tuple[str]) """ _dest_dataset = dest_dataset or DEFAULT_DATASET logging.info('main_process: dataset set to {}'.format(_dest_dataset)) q = JoinableQueue() msg = Queue() lock = Lock() if glob_load: logging.info('main_process: loading via glob method') object_keys = get_latest_object(bucket_name, prefix) if resume_load: object_keys = remove_loaded_objects(object_keys, _dest_dataset, alias, rename) for path, object_key in object_keys.items(): q.put((bucket_name, path, object_key)) else: logging.info('main_process: loading via non-glob method') object_keys = list_blobs_with_prefix(bucket_name, prefix) for object_key in object_keys: q.put((bucket_name, None, object_key)) args = (lock, q, msg, _dest_dataset, alias, cluster_by, drop, rename, replace) for c in range(concurrency): p = Process(target=_bulk_run, args=(c, ) + args) p.daemon = True p.start() logging.info('main_process: {} total tasks in queue'.format(q.qsize())) q.join() for c in range(concurrency): q.put(None) p.join() # if the msg queue is not empty it indicates that an exception occured in # the child process. exit 1 to indicate a failure event. # we only send error type messages right now but in the future we could # potentially have others if msg.empty(): logging.info('main_process: done') exit(0) else: logging.error('main_process: exceptions occured in child processes') while not msg.empty(): error_msg = msg.get() logging.error('main_process: {} had error {}'.format( error_msg[1], error_msg[2])) exit(1)
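# Hypothetical invocation showing the call shape documented in the bulk() docstring;
# every value below is a placeholder, not taken from the source.
if __name__ == '__main__':
    bulk(
        bucket_name='example-bucket',
        prefix='dataset/v1',
        concurrency=4,
        glob_load=True,
        resume_load=False,
        dest_dataset=None,                # fall back to DEFAULT_DATASET
        alias=None,
        cluster_by=('submission_date',),  # placeholder clustering field
        drop=(),
        rename={},
        replace=(),
    )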
# if status_code == 200: # break # else: # proxy = 'https://%s' % ip_port # else: # print(os.getpid(), 'all down') # continue # print(os.getpid(), 'ok:', proxy) # break if __name__ == '__main__': # company_link = 'https://www.lagou.com/jobs/{pos_id}.html'.format(pos_id=3893733) # response = lagou_detail(company_link) # print(response.text) # lagou() # 183.233.89.222 from multiprocessing import Pool from multiprocessing import Process from multiprocessing import JoinableQueue queue = JoinableQueue() # pool = Pool(processes=4) for i in range(4): Process(target=proxy_pool, args=(queue, )).start() # pool.apply_async(proxy_pool, args=(queue,)) # pool.close() # pool.join() while True: print('queue =', queue.qsize()) time.sleep(5) print('ending')
def _parallelly_make_dataset(self): import multiprocessing from multiprocessing import Process from multiprocessing import JoinableQueue as Queue name_file = '{}/video_list.npy'.format(self.loc) len_file = '{}/video_lengths.npy'.format(self.loc) if isfile(name_file): video_list = np.load(name_file) video_lengths = np.load(len_file) return video_list, video_lengths q = Queue() qvideo_list = Queue() fnames_list = [] for root, _, fnames in tqdm(os.walk(self.root)): for fname in sorted(fnames): fnames_list.append(os.path.join(root, fname)) def parallel_worker(fnames_chunk): item = q.get() for fname in tqdm(fnames_chunk): if has_file_allowed_extension(fname, VIDEO_EXTENSION): video_path = fname vc = cv2.VideoCapture(video_path) length = int(vc.get(cv2.CAP_PROP_FRAME_COUNT)) if length > 0 and vc.isOpened(): qvideo_list.put((video_path, length)) qvideo_list.task_done() vc.release() q.task_done() processes = 32 n = len(fnames_list) chunk = int(n / processes) if chunk == 0: chunk = 1 fnames_chunks = [fnames_list[i*chunk:(i+1)*chunk] \ for i in range((n + chunk - 1) // chunk)] for i in range(processes): q.put(i) multiprocessing.Process(target=parallel_worker, args=(fnames_chunks[i], )).start() q.join() qvideo_list.join() video_list = [] video_lengths = [] while qvideo_list.qsize() != 0: video, length = qvideo_list.get() video_list.append(video) video_lengths.append(length) np.save(name_file, video_list) np.save(len_file, video_lengths) return video_list, video_lengths
print("Getting list of connected research articles") article_PMIDs = r.smembers('linked_articles') article_URLs = r.mget(['{0}:URL'.format(PMID) for PMID in article_PMIDs]) uncatalogged = Queue() for args in zip(article_URLs, article_PMIDs): uncatalogged.put(args) if DEBUG: # Run one process print("DEBUG: Running single threaded.") parallel_worker() else: print("Starting pool") NUM_WORKERS = 7 pool = Pool(NUM_WORKERS) results = [pool.apply_async(parallel_worker) for i in range(NUM_WORKERS)] print("Running progress capture.") while (True): remaining = uncatalogged.qsize() print "Waiting for", remaining, "tasks to complete..." time.sleep(0.5) # print [result.get() for result in results] uncatalogged.join() print 'Done'
conn.row_factory = sqlite3.Row vids = load_vendor_ids('vendorids.txt') #vids = {'04f9':1, '0424':1} vids_done = {} #get_completed_vids() request_queue = JoinableQueue() result_queue = JoinableQueue() [ request_queue.put((vid, prepare_driver_req(vid))) for vid in vids if vid not in vids_done ] print 'total vids: %d, vids done: %d, vids remaining: %d' % ( len(vids), len(vids_done), request_queue.qsize()) pool = Pool(4, request_worker, (request_queue, )) pool.close() while True: try: vid, current_page, total_pages, drivers = result_queue.get(True) except Exception as e: traceback.print_exc() break if drivers: print 'saving %s drivers for %s' % (len(drivers), vid) save_drivers(drivers, vid) else:
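# Hedged helper: result_queue.get(True) above blocks indefinitely once the workers go
# idle, so the surrounding except clause can never end the loop on its own; a bounded
# get makes the exit condition explicit.
from queue import Empty

def get_with_timeout(q, seconds=30):
    # Returns the next item, or None if nothing arrives within `seconds`.
    try:
        return q.get(True, seconds)
    except Empty:
        return None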
class Crawler(): def __init__(self): self.db = Mongo() self.cdb = DbClient() self.page = None self.session = None self.set_session() self.search_url_Queue = JoinableQueue() def set_session(self): s = requests.session() s.cookies.update(self.get_cookie()) s.headers.update(HEADERS) self.session = s def get_cookie(self): # 获取不为空的cookie while True: q = self.cdb.get_cookies(flag=1) if q == None: print('时间等待') time.sleep(10) continue else: d = {} if q: self.user = q['user'] cookies = q['cookies'] for cookie in cookies: d[cookie.get('name')] = cookie.get('value') return d def get_page(self, url): url = url #r = self.session.get(url, headers=HEADERS, cookies=self.get_cookie()) r = self.session.get(url, timeout=(10, 15)) if r.text.find('亲,小二正忙,滑动一下马上回来') > 0: print("cookie需要验证!!!") self.cdb.update_cookie_flag2(self.user) return False if r.text.find('请输入') > 0: print("Need Login!!!") self.cdb.update_cookie_flag0(self.user) return False self.page = r.text self.parse() time.sleep(4) return True def parse(self): pattern = re.compile(r'g_page_config = ({.*});') m = re.search(pattern, self.page) if not m: print('Cannot fount data in this page.') with open('log_page.txt', 'w') as f: f.write(self.page) return False g_page_config = json.loads(m.group(1)) auctions = g_page_config['mods']['itemlist']['data']['auctions'] for auction in auctions: try: simil_url_short = auction.get('i2iTags', { "samestyle": '/' }).get('samestyle', {"url", '/'}).get('url', '') except Exception as e: simil_url_short = '' d = {} d['keyword'] = KEYWORD d['t_link'] = 'https:' + auction.get('detail_url', '/') d['title'] = auction.get('raw_title') d['price'] = auction.get('view_price') d['shop_name'] = auction.get('nick') d['sales_num'] = auction.get('view_sales', '0').replace('人收货', '').replace('人付款', '') d['simil_url_short'] = simil_url_short d['flag'] = 0 print(d.get('keyword'), d.get('title'), d.get('simil_url_short')) self.db.insert(d) def run_cry(self): while True: print('【{}实时展示需要-请求-的原商品-链接】', self.search_url_Queue.qsize()) search_url = self.search_url_Queue.get() # 获得搜寻数据 print('Crawling page {}'.format(search_url)) flag = self.get_page(url=search_url) self.search_url_Queue.task_done() def run(self): for i in range(1, 4): page = str(i * 44) url = 'https://s.taobao.com/search?q=' + KEYWORD + '&sort=sale-desc&s=' + page print('搜索的初始url', url) self.search_url_Queue.put(url) Thread_list = [] for i in range(1): Tsearch_page = threading.Thread(target=self.run_cry, args=()) Thread_list.append(Tsearch_page) for p in Thread_list: p.daemon = True p.start() for all in [ self.search_url_Queue, self.parse_data_search_shop_Queue, self.data_search_shop_Queue, self.parse_data_simil_shop_Queue, self.data_simil_shop_Queue, ]: all.join()
histogram_merge_worker.start() if args.top: reader_procs = [ psutil.Process(reader.pid) for reader in readers ] worker_procs = [ psutil.Process(worker.pid) for worker in workers ] pair_buffer={} scaffold_count={} # while (not inq.empty()) or sum( [reader.is_alive() for reader in readers] )>0: while True: if args.debug: print("get") try: procid,scaffold,pairs = inq.get() # procid,scaffold,pairs = inq.get(True,10) #print("#got data:",procid,scaffold,len(pairs)) print("#got data from inq:",procid,scaffold,len(pairs),inq.empty(),inq.qsize(),inq.full(),strftime("%Y-%m-%d %H:%M:%S"),sum( [reader.is_alive() for reader in readers] ),"q.size():",q.qsize(),file=sys.stderr,sep="\t") sys.stderr.flush() sys.stdout.flush() except Exception as e: print(e,file=sys.stderr) if args.top: print("queue get timed out",[reader.cpu_percent() for reader in reader_procs],[worker.cpu_percent() for worker in worker_procs]) #print("#timed out",inq.empty()) print("#read from queue timed out:",inq.empty(),inq.qsize(),inq.full(),strftime("%Y-%m-%d %H:%M:%S"),sum( [reader.is_alive() for reader in readers] ),file=sys.stderr,sep="\t") sys.stderr.flush() continue if args.debug: print("got") if not scaffold in pair_buffer: pair_buffer[scaffold]=[] pair_buffer[scaffold] += pairs scaffold_count[scaffold] = scaffold_count.get(scaffold,0)+1