def trainerLearnScoreParallel(lrLearner, svmLearner, knnLearner):
    manager = Manager()
    lrScore, svmScore, knnScore = manager.Value('d', 0.0), manager.Value('d', 0.0), manager.Value('d', 0.0)
    temp = manager.Namespace()
    temp.learner = lrLearner
    lrLearner = temp
    temp = manager.Namespace()
    temp.learner = svmLearner
    svmLearner = temp
    temp = manager.Namespace()
    temp.learner = knnLearner
    knnLearner = temp
    lrP = Process(target=trainerLearnScore, args=(lrLearner, 'LogReg', finalFeatures, finalAnswers, testFeatures, testAnswers, lrScore))
    svmP = Process(target=trainerLearnScore, args=(svmLearner, 'SVM', finalFeatures, finalAnswers, testFeatures, testAnswers, svmScore))
    knnP = Process(target=trainerLearnScore, args=(knnLearner, 'kNN', finalFeatures, finalAnswers, testFeatures, testAnswers, knnScore))
    lrP.start()
    svmP.start()
    knnP.start()
    lrP.join()
    svmP.join()
    knnP.join()
    lrLearner = lrLearner.learner
    svmLearner = svmLearner.learner
    knnLearner = knnLearner.learner
    return (lrLearner, svmLearner, knnLearner, lrScore, svmScore, knnScore)
def make_context():
    manager = Manager()
    context = {
        'status': manager.Namespace(),
        'configLock': manager.Lock(),
        'streams': manager.Namespace(),
        'filecopy': manager.Namespace(),
        'logs': manager.Queue()
    }
    return context
def _main(*, input_files, clean, preprocess_args=None, plot_args, output_path):
    if clean:
        shutil.rmtree(output_path, ignore_errors=True)
    ensure_directory(output_path)
    dfs = [pd.read_parquet(f.format(**os.environ)) for f in input_files]
    df = pd.concat(dfs)
    if preprocess_args:
        df = preprocess(df, **preprocess_args)
    print(df.head())
    for col in df.columns:
        if df[col].nunique() < 20:
            values = ', '.join(map(str, df[col].unique()))
            print(f'{col} has the values: {values}')
    plot_args = list(expand(df, plot_args))
    plot_args = [{**pa, 'output_path': output_path} for pa in plot_args]
    print(f"Make {len(plot_args)} plots.")
    mgr = Manager()
    ns = mgr.Namespace()
    ns.df = df
    pool = Pool(20)
    data_args = list(zip(plot_args, [ns] * len(plot_args)))
    pool.map(_plot, data_args)
def test_call_aws_cli_called(mock_aws):
    """
    Test that the aws cli is called since the object storage class is standard
    """
    scripts.aws_replicate.logger = MagicMock()
    subprocess.Popen = MagicMock()
    utils.get_aws_bucket_name = MagicMock()
    mock_aws.return_value = "tcga-open"
    scripts.aws_replicate.bucket_exists = MagicMock()
    scripts.aws_replicate.bucket_exists.return_value = True
    scripts.aws_replicate.object_exists = MagicMock()
    scripts.aws_replicate.object_exists.return_value = True
    scripts.aws_replicate.get_object_storage_class = MagicMock()
    boto3.session.Session = MagicMock()
    scripts.aws_replicate.get_object_storage_class.return_value = "STANDARD"
    manager = Manager()
    manager_ns = manager.Namespace()
    manager_ns.total_processed_files = 0
    manager_ns.total_copied_data = 0
    lock = manager.Lock()
    job_info = scripts.aws_replicate.JobInfo(
        {}, gen_mock_manifest_data()[0], 1, 1, "", {}, {}, manager_ns, "bucket"
    )
    scripts.aws_replicate.exec_aws_copy(lock, False, job_info)
    assert subprocess.Popen.call_count == 1
def multi_run(producer, consumer, pool_cnt=20, producer_cnt=10, consumer_cnt=10):
    global token
    global last_down_w
    global finished
    manager = Manager()
    q1 = manager.Queue(10000)
    ws = [i for i in range(1, int(no_of_images) + 1, 100)]
    for i in ws:
        q1.put(i)
    last_down_w = ws[-1]
    q = manager.Queue(10000)
    p = Pool(pool_cnt)
    nm = manager.Namespace()
    nm.running = True
    for i in xrange(producer_cnt):
        pw = p.apply_async(producer, args=(q1, q, nm,))
        print 'new producer'
        sys.stdout.flush()
        time.sleep(0.1)
    for i in xrange(consumer_cnt):
        p.apply_async(consumer, args=(q1, q, nm,))
        print 'new consumer'
        sys.stdout.flush()
    p.close()
    p.join()
def _random(self, random_changes=None, random_upgrades=None, tries=100):
    manager = Manager()
    glob = manager.Namespace()
    # start = time.time()
    glob.best_grid = self.copy()
    glob.best_score = glob.best_grid.score()
    lock = Lock()

    def f(i, glob, lock):
        start_grid = self.copy()
        start_grid._random_upgrade(random_upgrades)
        # start_grid._random_change(random_changes)
        # start_grid.brute_upgrade()
        score = start_grid.score()
        with lock:
            if score > glob.best_score:
                print(i, score)
                glob.best_grid = start_grid.copy()
                glob.best_score = score
        return

    ps = [Process(target=f, args=(i, glob, lock)) for i in range(tries)]
    [p.start() for p in ps]
    [p.join() for p in ps]
    # try:
    # except KeyboardInterrupt:
    #     pass
    self = glob.best_grid
    # print("%d : %d : %.2fs" % (i, score, (time.time()-start)))
    return self
def __init__(self) -> None:
    self.Xd_shared = Array('d', [0, 0, 0])
    self.dXd_shared = Array('d', [0, 0, 0])
    # self.dXd_shared = [0,0,0]
    self.force = Value('d', 0.0)
    mgr = Manager()
    self.namespace = mgr.Namespace()
    self.namespace.my_list = []
def scrape_recommendations(client_id, client_secret):
    manager = Manager()
    namespace = manager.Namespace()
    namespace.credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
    sp = spotipy.Spotify(client_credentials_manager=namespace.credentials_manager)
    genres = sp.recommendation_genre_seeds()["genres"]
    worker = partial(scraping_worker, namespace)
    with Pool(processes=len(genres)) as pool:
        pool.map(worker, genres)
def __init__(self):
    process_manager = Manager()
    self._shared_data = process_manager.Namespace()
    self._shared_data.alive = True
    self._queue = Queue()
    self._worker = Process(target=input_worker, args=(self._shared_data, self._queue))
    self._worker.start()
def main():
    processes = []
    manager = Manager()
    shared_namespace = manager.Namespace()
    shared_namespace.hashes = []
    for i in range(NB_POOLS):
        proc = Process(target=process_job, args=(i, shared_namespace))
        proc.start()
        processes.append(proc)
    for proc in processes:
        proc.join()
def run_downloader(db_connection, logger):
    manager = Manager()
    shared_state = manager.Namespace()
    shared_state.default_date = get_default_date()
    shared_state.past_date = None
    job_serializer = Serializer(db_connection, job)
    # TODO: Check if today's job is already running.
    # If so, just go with missed jobs.
    logger.info('Starting the main workflow')
    p = start_main_job(shared_state)
    if environ.get('JUST_MAIN', False):
        p.join()
        return
    end_time = datetime.now()\
        .replace(hour=23, minute=30, second=0, microsecond=0)
    while True:
        sleep(5)
        p.join(timeout=0)
        if not p.is_alive():
            job_id = shared_state.job_id
            completed = shared_state.completed
            if job_id is not None and not completed:
                job_serializer.put(job_id, {
                    'status': JobStatus.FAILED,
                })
                if p.exitcode != 0:
                    logger.error('Job exited unexpectedly',
                                 f'Exit code: {p.exitcode}\nJob id: {job_id}')
            if datetime.now() >= end_time:
                break
            p = start_past_job(shared_state)
            if p is None:
                break
        elif datetime.now() >= end_time:
            job_id = shared_state.job_id
            completed = shared_state.completed
            if job_id is not None and not completed:
                # Time to end.
                p.terminate()
                job_serializer.put(job_id, {
                    'status': JobStatus.FAILED,
                })
    logger.info('All jobs finished.')
def trainOnSet(hyperParams, examples, theta, histGrad, cores):
    try:
        fixWords = hyperParams['fixEmb']
    except:
        fixWords = False
    try:
        fixWeights = hyperParams['fixW']
    except:
        fixWeights = False
    # print 'fixEmb:',fixWords, ',fixW:', fixWeights
    adagrad = hyperParams['ada']
    mgr = Manager()
    ns = mgr.Namespace()
    ns.lamb = hyperParams['lambda']
    batchsize = hyperParams['bSize']
    random.shuffle(examples)
    # randomly split the data into parts of batchsize
    avErrors = []
    for batch in xrange((len(examples) + batchsize - 1) // batchsize):
        ns.theta = theta
        minibatch = examples[batch * batchsize:(batch + 1) * batchsize]
        s = (len(minibatch) + cores - 1) // cores
        trainPs = []
        q = Queue()
        if cores < 2:
            trainBatch(ns, minibatch, q, fixWords, fixWeights)  # don't start a subprocess
            trainPs.append('')  # But do put a placeholder in the queue
        else:
            for j in xrange(cores):
                p = Process(name='minibatch' + str(batch) + '-' + str(j), target=trainBatch,
                            args=(ns, minibatch[j * s:(j + 1) * s], q, fixWords, fixWeights))
                trainPs.append(p)
                p.start()
        errors = []
        theta.regularize(hyperParams['alpha'] / len(examples), hyperParams['lambda'])
        for j in xrange(len(trainPs)):
            (grad, error) = q.get()
            if grad is None:
                continue
            theta.add2Theta(grad, hyperParams['alpha'], histGrad)
            errors.append(error)
        # make sure all worker processes have finished and are killed
        if cores > 1:
            for p in trainPs:
                p.join()
        try:
            avError = sum(errors) / len(errors)
        except:
            avError = 0
            print 'batch size zero!'
        if batch % 25 == 0:
            print '\t\tBatch', batch, ', average error:', avError, ', theta norm:', theta.norm()
        avErrors.append(avError)
    return sum(avErrors) / len(avErrors)
def __init__(self):
    self.i3 = i3ipc.Connection()
    self._actived = False
    self._win_rect = (0, 0, 0, 0)
    process_manager = Manager()
    self._shared_data = process_manager.Namespace()
    self._shared_data.alive = True
    self._shared_data.actived = False
    self._lock = Lock()
    self._worker = Process(target=info_worker, args=(self._lock, self._shared_data))
    self._worker.start()
class ParallelContext:
    """Holds the objects needed to coordinate parallelism."""

    def __init__(self,
                 spawner: GeneSpawner,
                 evaluator: Evaluator,
                 n_proc: Optional[int] = None):
        self.manager = Manager()
        self.ns = self.manager.Namespace()
        self.ns.spawner = spawner
        self.ns.evaluator = evaluator
        if n_proc is None:
            self.pool = Pool()
        else:
            self.pool = Pool(n_proc)
def main():
    final_result = []
    args = cmd_args_parser()
    case_id = int(args.case_num[3:])
    prefix = args.case_num[:3]
    case_type_filter = None
    if hasattr(args, "case_type_filter"):
        case_type_filter = args.case_type_filter
    lock = Lock()
    jobs = []
    mgr = Manager()
    ns = mgr.Namespace()
    ns.df = final_result
    start = case_id - args.batch
    end = case_id + args.batch
    total_num = end - start + 1
    if total_num > 20:
        batch_result = get_batch_pair(total_num, start, end)
        for i in range(len(batch_result)):
            p = multiprocessing.Process(target=query_website, args=(
                ns,
                batch_result[i],
                prefix,
                case_type_filter,
                lock,
                args.verbose,
            ))
            jobs.append(p)
            p.start()
        for job in jobs:
            job.join()
        final_result = ns.df
    else:
        for i in range(start, end):
            result = get_result(i, prefix, case_type_filter, args.verbose)
            if bool(result):
                final_result.append(result)
    json_type = json.dumps(final_result, indent=4)
    now = datetime.datetime.now()
    with open("data-%s.yml" % now.strftime("%Y-%m-%d"), "w") as outfile:
        yaml.dump(yaml.load(json_type), outfile, allow_unicode=True)
    print yaml.dump(yaml.load(json_type), allow_unicode=True, width=256)
def findTrainerErrorParallel(lrLearner, svmLearner, knnLearner, finalFeatures, finalAnswers, testFeatures, testAnswers):
    manager = Manager()
    lrTrainingError, lrTestingError, lrIndices = manager.list(), manager.list(), manager.list()
    svmTrainingError, svmTestingError, svmIndices = manager.list(), manager.list(), manager.list()
    knnTrainingError, knnTestingError, knnIndices = manager.list(), manager.list(), manager.list()
    temp = manager.Namespace()
    temp.learner = lrLearner
    lrLearner = temp
    temp = manager.Namespace()
    temp.learner = svmLearner
    svmLearner = temp
    temp = manager.Namespace()
    temp.learner = knnLearner
    knnLearner = temp
    lrP = Process(target=findTrainerError, args=(lrLearner, 'LogReg', finalFeatures, finalAnswers, testFeatures, testAnswers, lrTrainingError, lrTestingError, lrIndices))
    svmP = Process(target=findTrainerError, args=(svmLearner, 'SVM', finalFeatures, finalAnswers, testFeatures, testAnswers, svmTrainingError, svmTestingError, svmIndices))
    knnP = Process(target=findTrainerError, args=(knnLearner, 'kNN', finalFeatures, finalAnswers, testFeatures, testAnswers, knnTrainingError, knnTestingError, knnIndices))
    lrP.start()
    svmP.start()
    knnP.start()
    lrP.join()
    svmP.join()
    knnP.join()
    lrLearner = lrLearner.learner
    svmLearner = svmLearner.learner
    knnLearner = knnLearner.learner
    lrTrainingError, lrTestingError, lrIndices = np.asarray(lrTrainingError), np.asarray(lrTestingError), np.asarray(lrIndices)
    svmTrainingError, svmTestingError, svmIndices = np.asarray(svmTrainingError), np.asarray(svmTestingError), np.asarray(svmIndices)
    knnTrainingError, knnTestingError, knnIndices = np.asarray(knnTrainingError), np.asarray(knnTestingError), np.asarray(knnIndices)
    return (lrLearner, svmLearner, knnLearner,
            lrTrainingError, lrTestingError, lrIndices,
            svmTrainingError, svmTestingError, svmIndices,
            knnTrainingError, knnTestingError, knnIndices)
def main():
    final_result = []
    reminder_result = []
    args = cmdArgumentParser()
    case_numberic = int(args.case_num[3:])
    prefix = args.case_num[:3]
    lock = Lock()
    jobs = []
    mgr = Manager()
    ns = mgr.Namespace()
    ns.df = final_result
    start = case_numberic - args.batch
    end = case_numberic + args.batch
    total_num = end - start + 1
    rmnder = total_num % CPU_CORES
    if total_num > 20:
        batch_result = get_batch_pair(total_num, start, end)
        for i in range(len(batch_result)):
            p = multiprocessing.Process(target=query_website, args=(
                ns,
                batch_result[i],
                prefix,
                lock,
                args.verbose,
            ))
            jobs.append(p)
            p.start()
        for job in jobs:
            job.join()
        final_result = ns.df
        # for i in range(end - rmnder + 1, end):
        #     reminder_result.append(get_result(i, prefix))
    else:
        for i in range(start, end):
            final_result.append(get_result(i, prefix, args.verbose))
    json_type = json.dumps(final_result, indent=4)
    now = datetime.datetime.now()
    with open('data-%s.yml' % now.strftime("%Y-%m-%d"), 'w') as outfile:
        yaml.dump(yaml.load(json_type), outfile, allow_unicode=True)
    print yaml.dump(yaml.load(json_type), allow_unicode=True)
def main(): """Gets input hash and salt from command line, spawns worker threads, initializes queue, calls worker function""" # Check user input if len(sys.argv) != 2: sys.exit("Usage: python crack.py hash") # Extract hash and salt from user input user_hash = sys.argv[1] salt = user_hash[:2] # Input lists inputs = [HASH_INPUTS_1, HASH_INPUTS_2, HASH_INPUTS_3, HASH_INPUTS_4] # Manager to create shared Namespace mgr = Manager() namespace = mgr.Namespace() # Found to catch found event on threads namespace.found = False # The function to crack the DES hashed password. # It is the callback function to main's calls to set_tuple_permutations # via pool.apply_async. When the pool of child processes return results, # the main process will process them via the callback function DES_crack. # Spawn pool of child processes calling the worker # function: worker_crack in parallel and # applying async, with callback function: print_n_terminate # to the result returned from the workers. start_time = time.time() pool = Pool(NUM_PHYS_CORES) for aninput in inputs: pool.apply_async(worker_crack, args=(aninput, user_hash, salt, namespace), callback=print_n_terminate) pool.close() pool.join() duration = time.time() - start_time print(f"Cracking password took {duration:.2f} seconds") # Success print('success - exiting') sys.exit(0)
def main():
    # Create a multiprocessing manager to use as the token store
    global tokens, refresh_lock
    manager = Manager()
    tokens = manager.Namespace()
    refresh_lock = manager.Lock()
    # Authenticate in master process
    oauth2, tokens.access, tokens.refresh = authenticate(CooperativelyManagedOAuth2)
    # Create 2 worker processes and wait on them to finish
    workers = []
    for _ in range(2):
        worker_process = Process(target=worker)
        worker_process.start()
        workers.append(worker_process)
    for worker_process in workers:
        worker_process.join()
def iter_latest_asynchonously(gen_func, timeout=None, empty_value=None, use_forkserver=False, uninitialized_wait=None):
    """
    Given a generator function, make an iterator that pulls the latest value yielded when running it asynchronously.
    If a value has never been set, or the timeout is exceeded, yield empty_value instead.

    :param gen_func: A generator function (a function returning a generator)
    :return: An iterator over the latest values yielded so far (or empty_value)
    """
    if use_forkserver:
        from multiprocessing import set_start_method  # Only Python 3.X
        set_start_method('forkserver')  # On macOS this is necessary to start the camera in a separate thread
    m = Manager()
    namespace = m.Namespace()
    lock = Lock()
    with lock:
        namespace.time_and_data = (-float('inf'), Uninitialized)
    p = Process(target=_async_value_setter, args=(gen_func, namespace, lock))
    p.start()
    while True:
        with lock:
            lasttime, item = namespace.time_and_data
        if item is PoisonPill:  # The generator has terminated
            break
        elif item is Uninitialized:
            if uninitialized_wait is not None:
                time.sleep(uninitialized_wait)
                continue
            else:
                yield empty_value
        elif timeout is not None and (time.time() - lasttime) > timeout:
            # Nothing written, or nothing recent enough
            yield empty_value
        else:
            yield item
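# --- Hedged usage sketch (not from the source) ---
# A minimal example of how the iterator above might be consumed, assuming
# iter_latest_asynchonously and its helpers are importable from the snippet's
# module. The slow_counter generator function below is hypothetical and only
# stands in for a slow producer such as a camera feed.
import time

def slow_counter():
    i = 0
    while True:
        time.sleep(0.5)  # simulate a slow producer
        yield i
        i += 1

for latest in iter_latest_asynchonously(slow_counter, timeout=2.0, empty_value=None):
    # Yields the most recent value produced so far, or None (empty_value)
    # if nothing new has arrived within the timeout.
    print('latest value:', latest)
    time.sleep(0.1)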
def start(self):
    p = current_process()
    print('parent id: {}'.format(id(p)))
    manager = Manager()
    ns = manager.Namespace()
    ns.a = 1
    lst_proxy = manager.list()
    lst_proxy.append(1)
    dct_proxy = manager.dict()
    dct_proxy['b'] = 2
    print(ns.a)
    print(lst_proxy)
    print(dct_proxy)
    p = Process(target=self.modify, args=(ns, lst_proxy, dct_proxy))
    p.start()
    p.join()
    print(ns.a)
    print(lst_proxy)
    print(dct_proxy)
def __init__(self):
    process_manager = Manager()
    self._shared_data = process_manager.Namespace()
    self._shared_data.alive = True
    self._shared_data.DEFAULT_MOUSE = [
        uinput.BTN_LEFT,
        uinput.BTN_RIGHT,
        uinput.BTN_MIDDLE,
        uinput.REL_X,
        uinput.REL_Y,
        # TODO detect full screen size
        uinput.ABS_X + (0, 1920, 0, 0),
        uinput.ABS_Y + (0, 1080, 0, 0),
    ]
    self._queue = Queue()
    self._worker = Process(target=input_worker, args=(self._shared_data, self._queue))
    self._worker.start()
def main(): """Gets input hash and salt from command line, spawns worker processes, initializes Sync_Manager, calls worker function""" # Check user input if len(sys.argv) != 2: sys.exit("Usage: python crack.py hash") # Extract hash and salt from user input user_hash = sys.argv[1] salt = user_hash[:2] # Input lists inputs = [HASH_INPUTS_1, HASH_INPUTS_2, HASH_INPUTS_3, HASH_INPUTS_4] # Manager to create shared Namespace mgr = Manager() namespace = mgr.Namespace() # Found to catch found event on threads namespace.found = False # Spawn processes calling the worker start_time = time.time() processes = [] for i in range(len(inputs)): process = Process(target=worker_crack_password, args=(inputs[i], user_hash, salt, namespace), daemon=True) processes.append(process) process.start() for process in processes: process.join() duration = time.time() - start_time print( f"Cracking password took {duration:.2f} seconds using mp.Pool.apply_async()" ) # Success sys.exit(0)
def run(thread_num, global_config, job_name, manifest_file, bucket=None):
    """
    start threads and log after they finish
    """
    tasks, _ = prepare_data(manifest_file, global_config)
    manager = Manager()
    manager_ns = manager.Namespace()
    manager_ns.total_processed_files = 0
    jobInfos = []
    for task in tasks:
        job = JobInfo(global_config, task, len(tasks), job_name, {}, manager_ns, bucket)
        jobInfos.append(job)
    # Make the Pool of workers
    pool = Pool(thread_num)
    results = []
    if job_name == "copying":
        results = pool.map(exec_google_copy, jobInfos)
    elif job_name == "indexing":
        results = pool.map(check_and_index_the_data, jobInfos)
    # close the pool and wait for the work to finish
    pool.close()
    pool.join()
    filename = global_config.get("log_file", "{}_log.json".format(job_name))
    timestr = time.strftime("%Y%m%d-%H%M%S")
    filename = timestr + "_" + filename
    if job_name == "copying":
        results = [{"data": results}]
    json_log = {}
    for result in results:
        json_log.update(result)
class ParallelMgr():

    def __init__(self, **kwargs):
        num_pes = kwargs.get('num_pes', 1)
        # TODO: compare num_pes to multiprocessing.cpu_count()
        # manager for manager-to-worker communication & shared namespace
        self.mpMgr = Manager()
        self.namespace = self.mpMgr.Namespace()
        # need a copy of num_pes
        self.namespace.num_pes = num_pes
        # manager for manager-to-worker communication
        self.namespace.masterList = [self.mpMgr.list() for i in range(num_pes)]
        # self.masterList = [ self.commMgr.list() for i in range(self.num_pes) ]
        # separate lists for direct PE-to-PE communication
        self.namespace.msgLists = [self.mpMgr.list() for i in range(num_pes)]
        # self.msgLists = [ self.commMgr.list() for i in range(self.num_pes) ]
        print('ParallelMgr init')
        sys.stdout.flush()

    def runWorkers(self, workerModule):
        # self.peList = [ workerModule(i,self.namespace) for i in range(self.namespace.num_pes) ]
        self.peList = []
        for i in range(self.namespace.num_pes):
            commMgr = CommMgr(tid=i, namespace=self.namespace)
            pid = workerModule(i, commMgr)
            self.peList.append(pid)
        for pe in self.peList:
            print('starting worker ' + str(pe))
            pe.start()
        # TODO: check for errors

    def finalize(self):
        print('waiting for workers to finish')
        for pe in self.peList:
            print('worker ' + str(pe) + ' is_alive=' + str(pe.is_alive()))
        for pe in self.peList:
            pe.join()
class Test(object):

    def __init__(self):
        self.manager = Manager()
        self.namespace = self.manager.Namespace()
        self.process_list = []
        self.namespace.d = self.manager.dict()

    def print_me(self, ns, i):
        print("\nBefore:{0}, {1}, {2} ".format(current_process().name, i, ns.d))
        temp = i * i
        ns.d[i] = temp
        print("\nAfter:{0}, {1}, {2} ".format(current_process().name, i, ns.d))
        time.sleep(5)
        return

    def spawn_processes(self):
        for i in range(5):
            p = Process(target=self.print_me, args=(self.namespace, i,))
            self.process_list.append(p)
            p.start()
        for i in self.process_list:
            i.join()
        print(self.namespace.d)

    def spawn_processes22(self):
        for i in range(5):
            p = Process(target=self.print_me, args=(self.namespace, i,))
            self.process_list.append(p)
            p.start()
            print("exitcode is:{0}".format(p.exitcode))
        while self.process_list:
            for i in self.process_list:
                if not i.is_alive():
                    print("exitcode is:{0}".format(i.exitcode))
                    print("process:{0} is done, exit code:{1}".format(i.name, i.exitcode))
                    self.process_list.remove(i)
                    i.terminate()
        print(self.namespace.d)
def get_prices(self):
    mgr = Manager()
    ns = mgr.Namespace()
    ns.prices = pd.DataFrame()
    d = mgr.dict()
    filenames = os.listdir(self.prices_dir)
    count = len(filenames)
    jobs = []
    for filename in tqdm(filenames):
        job = Process(target=self.worker, args=(ns, filename))
        job.start()
        jobs.append(job)
    for job in jobs:
        job.join()
    prices = ns.prices
    # print(prices.head())
    self.prices = prices
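# --- Hedged sketch (not from the source) ---
# One way the `worker` referenced above might update the shared DataFrame.
# Manager.Namespace proxies only propagate attribute *re-assignment*, not
# in-place mutation, so the worker reads ns.prices, concatenates, and assigns
# the result back. The CSV file format and the body of this method are
# assumptions for illustration only.
def worker(self, ns, filename):
    path = os.path.join(self.prices_dir, filename)
    df = pd.read_csv(path)  # assumed file format
    # Re-assign the attribute so the proxy pushes the update to the manager;
    # as in the snippet above there is no lock here, so concurrent workers
    # could overwrite each other's updates.
    ns.prices = pd.concat([ns.prices, df])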
def upload_parts(self):
    args_list = []
    if OS_WINDOWS:
        self.ns = FakeNamespace()
    else:
        manager = Manager()
        self.ns = manager.Namespace()
    self.ns.completed = 0
    part_amount = int(math.ceil(self.file_size / float(self.part_size)))
    self.total_parts = part_amount
    self.pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=self.total_parts).start()
    try:
        for i in xrange(part_amount):
            offset = i * self.part_size
            remaining_bytes = self.file_size - offset
            bytes = min(remaining_bytes, self.part_size)
            if not self.multiparts.uploaded(i + 1):
                args_list.append([
                    self.file_path, offset, bytes, self.url, self.upload_id,
                    i + 1, self.headers, self.verify, self.pbar, self.ns
                ])
            else:
                self.total_parts -= 1
        if self.total_parts == 0:
            return
        self.pbar.maxval = self.total_parts
        pool = Pool(processes=self.processes)
        pool.map_async(upload_multipart_wrapper, args_list).get(9999999)
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        log.error("Caught KeyboardInterrupt, terminating workers")
        pool.terminate()
        pool.join()
        raise Exception("Process canceled by user")
def test_call_streamming_method_called(mock_aws):
    """
    Test that the streaming method is called since the object is Glacier
    """
    scripts.aws_replicate.logger = MagicMock()
    subprocess.Popen = MagicMock()
    scripts.aws_replicate.stream_object_from_gdc_api = MagicMock()
    mock_aws.return_value = "tcga-open"
    scripts.aws_replicate.bucket_exists = MagicMock()
    scripts.aws_replicate.bucket_exists.return_value = True
    scripts.aws_replicate.object_exists = MagicMock()
    scripts.aws_replicate.object_exists.return_value = False
    source_objects = {"11111111111111111/abc.bam": {"StorageClass": "GLACIER"}}
    copied_objects = {}
    manager = Manager()
    manager_ns = manager.Namespace()
    manager_ns.total_processed_files = 0
    manager_ns.total_copied_data = 0
    lock = manager.Lock()
    job_info = scripts.aws_replicate.JobInfo(
        {},
        gen_mock_manifest_data()[0],
        1,
        1,
        "",
        copied_objects,
        source_objects,
        manager_ns,
        "bucket",
    )
    scripts.aws_replicate.exec_aws_copy(lock, False, job_info)
    assert subprocess.Popen.call_count == 0
    assert scripts.aws_replicate.stream_object_from_gdc_api.call_count == 1
class Pipeline:
    def __init__(self,
                 granule_loader: GranuleLoader,
                 slicer: TileSlicer,
                 data_store_factory,
                 metadata_store_factory,
                 tile_processors: List[TileProcessor],
                 max_concurrency: int):
        self._granule_loader = granule_loader
        self._tile_processors = tile_processors
        self._slicer = slicer
        self._data_store_factory = data_store_factory
        self._metadata_store_factory = metadata_store_factory
        self._max_concurrency = max_concurrency

        # Create a SyncManager so that we can communicate exceptions from the
        # worker processes back to the main process.
        self._manager = Manager()

    def __del__(self):
        self._manager.shutdown()

    @classmethod
    def from_string(cls, config_str: str, data_store_factory, metadata_store_factory, max_concurrency: int = 16):
        try:
            config = yaml.load(config_str, yaml.FullLoader)
            cls._validate_config(config)
            return cls._build_pipeline(config,
                                       data_store_factory,
                                       metadata_store_factory,
                                       processor_module_mappings,
                                       max_concurrency)
        except yaml.scanner.ScannerError:
            raise PipelineBuildingError("Cannot build pipeline because of a syntax error in the YAML.")

    # TODO: this method should validate the config against an actual schema definition
    @staticmethod
    def _validate_config(config: dict):
        if type(config) is not dict:
            raise PipelineBuildingError("Cannot build pipeline; the pipeline configuration that " +
                                        "was received is not valid YAML.")

    @classmethod
    def _build_pipeline(cls,
                        config: dict,
                        data_store_factory,
                        metadata_store_factory,
                        module_mappings: dict,
                        max_concurrency: int):
        try:
            granule_loader = GranuleLoader(**config['granule'])

            slicer_config = config['slicer']
            slicer = cls._parse_module(slicer_config, module_mappings)

            tile_processors = []
            for processor_config in config['processors']:
                module = cls._parse_module(processor_config, module_mappings)
                tile_processors.append(module)

            return cls(granule_loader,
                       slicer,
                       data_store_factory,
                       metadata_store_factory,
                       tile_processors,
                       max_concurrency)
        except PipelineBuildingError:
            raise
        except KeyError as e:
            raise PipelineBuildingError(f"Cannot build pipeline because {e} is missing from the YAML.")
        except Exception as e:
            logger.exception(e)
            raise PipelineBuildingError(f"Cannot build pipeline because of the following error: {e}")

    @classmethod
    def _parse_module(cls, module_config: dict, module_mappings: dict):
        module_name = module_config.pop('name')
        try:
            module_class = module_mappings[module_name]
            logger.debug("Loaded processor {}.".format(module_class))
            processor_module = module_class(**module_config)
        except KeyError:
            raise PipelineBuildingError(f"'{module_name}' is not a valid processor.")
        except Exception as e:
            raise PipelineBuildingError(f"Parsing module '{module_name}' failed because of the following error: {e}")

        return processor_module

    async def run(self):
        async with self._granule_loader as (dataset, granule_name):
            start = time.perf_counter()

            shared_memory = self._manager.Namespace()
            async with Pool(initializer=_init_worker,
                            initargs=(self._tile_processors,
                                      dataset,
                                      self._data_store_factory,
                                      self._metadata_store_factory,
                                      shared_memory),
                            maxtasksperchild=self._max_concurrency,
                            childconcurrency=self._max_concurrency) as pool:
                serialized_tiles = [nexusproto.NexusTile.SerializeToString(tile)
                                    for tile in self._slicer.generate_tiles(dataset, granule_name)]
                # aiomultiprocess is built on top of the stdlib multiprocessing library, which has
                # the limitation that a queue can't have more than 2**15-1 tasks. So, we have to batch it.
                for chunk in self._chunk_list(serialized_tiles, MAX_CHUNK_SIZE):
                    try:
                        await pool.map(_process_tile_in_worker, chunk)
                    except ProxyException:
                        pool.terminate()
                        # Give the shared memory manager some time to write the exception
                        # await asyncio.sleep(1)
                        raise pickle.loads(shared_memory.error)

            end = time.perf_counter()
            logger.info("Pipeline finished in {} seconds".format(end - start))

    @staticmethod
    def _chunk_list(items, chunk_size: int):
        return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]