def download(self, index_path, txt_dir):
    """Download every filing listed in an index file and save it as text.

    Args:
        index_path: Path to a comma-separated index file whose rows are
            ``form_type, company_name, cik, date_filed, filename``.
        txt_dir: Directory receiving one ``.txt`` file per filing. It is
            created when missing and stored as ``self.txt_dir``.

    Filings whose text file already exists are skipped. A failure on one
    filing is printed and does not stop the remaining downloads.
    """
    # Save to txt dir
    self.txt_dir = txt_dir
    if not os.path.exists(self.txt_dir):
        os.makedirs(self.txt_dir)

    # Count total URLs to process (only used in the progress message).
    with open(index_path, 'r') as fin:
        num_urls = sum(1 for _ in fin)

    def iter_path_generator(index_path):
        """Yield ``(1-based row number, url)`` for each row of the index."""
        with open(index_path, 'r') as fin:
            reader = csv.reader(fin, delimiter=',', quotechar='\"',
                                quoting=csv.QUOTE_ALL)
            for url_idx, row in enumerate(reader, 1):
                form_type, company_name, cik, date_filed, filename = row
                # Normalise any backslashes so the result is a valid URL.
                url = os.path.join(SEC_GOV_URL, filename).replace("\\", "/")
                yield (url_idx, url)

    def download_job(obj):
        """Fetch one filing, strip the markup and write the text file."""
        url_idx, url = obj
        fname = '_'.join(url.split('/')[-2:])
        fname = os.path.splitext(fname)[0]
        text_path = os.path.join(self.txt_dir, fname + '.txt')

        if os.path.exists(text_path):
            print("Already exists, skipping {}...".format(url))
            sys.stdout.write("\033[K")
            return

        print("Total: {}, Downloading & Parsing: {}...".format(
            num_urls, url_idx))
        sys.stdout.write("\033[K")
        try:
            # The request is inside the try so a network error on one URL
            # is reported instead of aborting the whole map.
            r = requests.get(url)
            # Do not store an HTTP error page as if it were the filing;
            # it would then be skipped as "already exists" on later runs.
            r.raise_for_status()
            # Parse html with Beautiful Soup
            soup = BeautifulSoup(r.content, "html.parser")
            text = soup.get_text("\n")
            # Process Text
            text = self._process_text(text)
            # Write to file
            with codecs.open(text_path, 'w', encoding='utf-8') as fout:
                fout.write(text)
        except Exception as e:
            # Exception (not BaseException) so KeyboardInterrupt and
            # SystemExit still propagate.
            print("{} parsing failed: {}".format(url, e))

    # Use at most 8 worker processes.
    ncpus = min(cpu_count(), 8)
    pool = ProcessPool(ncpus)
    pool.map(download_job, iter_path_generator(index_path))
def p_uimap(function, *arrays, num_cpus=None):
    """Returns an iterator for a parallel unordered map with a progress bar.

    Args:
        function: The function to apply to each element of the given arrays.
        arrays: One or more arrays of the same length containing the data to
            be mapped. The progress-bar total is taken from the first one.
        num_cpus: The number of cpus to use in parallel. If an int, uses that
            many cpus. If a float, uses that proportion of cpus (at least
            one). If None, uses all available cpus.

    Returns:
        An iterator which will apply the function to each element of the
        given arrays in parallel with a progress bar. The results may be in
        any order.
    """
    if num_cpus is None:
        num_cpus = cpu_count()
    elif isinstance(num_cpus, float):
        # A small proportion on a small machine would round to 0 workers.
        num_cpus = max(1, int(round(num_cpus * cpu_count())))

    return tqdm(Pool(num_cpus).uimap(function, *arrays),
                total=len(arrays[0]))
def __init__(self, *, extractor: Extractor,
             normalizer_nld: Optional[NormalizerNLD] = None,
             normalizer_gsf: Optional[NormalizerGSF] = None,
             normalizer_simultan: Optional[NormalizerSimultan] = None,
             path: Optional[Union[str, Path]] = 'saved_run/normalizers',
             regenerate: bool = False):
    """Store the extractor and private copies of the given normalizers.

    Args:
        extractor (Extractor): Extractor instance (stored by reference).
        normalizer_nld (NormalizerNLD, optional): NormalizerNLD instance;
            a deep copy is stored.
        normalizer_gsf (NormalizerGSF, optional): NormalizerGSF instance;
            a deep copy is stored.
        normalizer_simultan (NormalizerSimultan, optional):
            NormalizerSimultan instance; a deep copy is stored.
        path (str or Path, optional): Directory stored as ``self.path``.
            It is created (including parents) unless ``None`` is given.
        regenerate (bool): Passed on to the parent class initializer.
    """
    super().__init__(regenerate)

    self.extractor = extractor
    self.normalizer_nld = copy.deepcopy(normalizer_nld)
    self.normalizer_gsf = copy.deepcopy(normalizer_gsf)
    self.normalizer_simultan = copy.deepcopy(normalizer_simultan)

    # Leave one cpu free when more than one is available.
    self.nprocesses: int = max(cpu_count() - 1, 1)
    self.res: Optional[List[ResultsNormalized]] = None

    self.path = None if path is None else Path(path)
    if self.path is not None:
        self.path.mkdir(exist_ok=True, parents=True)
def _parallel(ordered: bool, function: Callable, *iterables: Iterable,
              **kwargs: Any) -> Generator:
    """Returns a generator for a parallel map with a progress bar.

    Arguments:
        ordered(bool): True for an ordered map, false for an unordered map.
        function(Callable): The function to apply to each element of the
            given Iterables.
        iterables(Tuple[Iterable]): One or more Iterables containing the
            data to be mapped.
        num_cpus (keyword): Number of cpus. An int is used as is, a float is
            taken as a proportion of the available cpus (at least one),
            None uses all of them. Remaining keyword arguments are passed
            to tqdm.

    Returns:
        A generator which will apply the function to each element of the
        given Iterables in parallel with a progress bar.
    """
    # Extract and determine num_cpus
    num_cpus = kwargs.pop('num_cpus', None)
    if num_cpus is None:
        num_cpus = cpu_count()
    elif isinstance(num_cpus, float):
        # A small proportion must still give at least one worker.
        num_cpus = max(1, int(round(num_cpus * cpu_count())))

    # Length of the progress bar is that of the shortest sized iterable.
    # When none of the iterables has a length (e.g. only generators), min()
    # would raise on the empty sequence, so fall back to an unknown total.
    lengths = [len(iterable) for iterable in iterables
               if isinstance(iterable, Sized)]
    length = min(lengths) if lengths else None

    # Create parallel generator
    pool = Pool(num_cpus)
    map_func = getattr(pool, 'imap' if ordered else 'uimap')
    try:
        for item in tqdm(map_func(function, *iterables), total=length,
                         **kwargs):
            yield item
    finally:
        # Also runs when the consumer stops early or the function raises.
        pool.clear()
def _parallel(ordered, function, *arrays, **kwargs):
    """Returns an iterator for a parallel map with a progress bar.

    Arguments:
        ordered(bool): True for an ordered map, false for an unordered map.
        function(function): The function to apply to each element of the
            given arrays.
        arrays(tuple): One or more arrays of the same length containing the
            data to be mapped. If a non-list variable is passed, it will be
            repeated a number of times equal to the lengths of the list(s).
            If only non-list variables are passed, the function will be
            performed num_iter times.
        num_cpus(int): The number of cpus to use in parallel. If an int,
            uses that many cpus. If a float, uses that proportion of cpus
            (at least one). If None, uses all available cpus.
        num_iter(int): If only non-list variables are passed, the function
            will be performed num_iter times on these variables. Default: 1.

    Remaining keyword arguments are passed to tqdm.

    Returns:
        An iterator which will apply the function to each element of the
        given arrays in parallel with a progress bar.
    """
    # Convert tuple to list so entries can be replaced below
    arrays = list(arrays)

    # Extract kwargs
    num_cpus = kwargs.pop('num_cpus', None)
    num_iter = kwargs.pop('num_iter', 1)

    # Determine num_cpus
    if num_cpus is None:
        num_cpus = cpu_count()
    elif isinstance(num_cpus, float):
        # A small proportion must still give at least one worker.
        num_cpus = max(1, int(round(num_cpus * cpu_count())))

    # Determine num_iter when at least one list is present
    list_lengths = [len(array) for array in arrays
                    if isinstance(array, list)]
    if list_lengths:
        num_iter = max(list_lengths)

    # Convert single variables to lists
    # and confirm lists are same length
    for i, array in enumerate(arrays):
        if isinstance(array, list):
            # Kept as an assert so callers see the same exception type.
            assert len(array) == num_iter
        else:
            arrays[i] = [array for _ in range(num_iter)]

    # Create parallel iterator
    map_type = 'imap' if ordered else 'uimap'
    mapper = getattr(Pool(num_cpus), map_type)
    return tqdm(mapper(function, *arrays), total=num_iter, **kwargs)
def mlp():
    """Run ``preprocess_job`` over every ``*.txt`` file in ``src_dir``.

    ``tar_dir`` is created first when it does not exist. The work is mapped
    over a process pool of at most 8 workers.
    """
    if not os.path.exists(tar_dir):
        os.makedirs(tar_dir)

    txt_files = glob(os.path.join(src_dir, '*.txt'))
    workers = min(cpu_count(), 8)
    ProcessPool(workers).map(preprocess_job, txt_files)
def mp_map(function, sequence, *args, **kwds):
    '''extend python's parallel map function to multiprocessing

Inputs:
    function  -- target function
    sequence  -- sequence to process in parallel

Additional Inputs:
    nproc     -- number of 'local' cpus to use  [default = 'autodetect']
    type      -- processing type ['blocking', 'non-blocking', 'unordered']
    threads   -- if True, use threading instead of multiprocessing

NOTE(review): 'nproc' is accepted and removed from the keywords, but its
value is not used anywhere in this function -- confirm the pools are sized
elsewhere.
    '''
    # dict.pop with a default replaces has_key(), which no longer exists in
    # Python 3; this form works on Python 2 as well.
    kwds.pop('nproc', None)
    proctype = kwds.pop('type', 'blocking')
    threads = kwds.pop('threads', False)

    # remove all the junk kwds that are added due to poor design!
    for junk in ('nnodes', 'nodes', 'launcher', 'mapper', 'queue',
                 'timelimit', 'scheduler', 'ncpus', 'servers'):
        kwds.pop(junk, None)

    pool = tp if threads else mp
    if proctype in ('unordered',):
        mapper = pool.uimap
    elif proctype in ('non-blocking', 'ordered'):
        mapper = pool.imap
    else:
        # 'blocking' and any unrecognised type use the blocking map
        mapper = pool.map
    return mapper(function, sequence, *args, **kwds)
def mp_map(function, sequence, *args, **kwds):
    '''extend python's parallel map function to multiprocessing

Inputs:
    function  -- target function
    sequence  -- sequence to process in parallel

Additional Inputs:
    nproc     -- number of 'local' cpus to use  [default = 'autodetect']
    type      -- processing type ['blocking', 'non-blocking', 'unordered']
    threads   -- if True, use threading instead of multiprocessing

NOTE(review): 'nproc' is accepted and removed from the keywords, but its
value is not used anywhere in this function -- confirm the pools are sized
elsewhere.
    '''
    # dict.pop with a default replaces has_key(), which no longer exists in
    # Python 3; this form works on Python 2 as well.
    kwds.pop('nproc', None)
    proctype = kwds.pop('type', 'blocking')
    threads = kwds.pop('threads', False)

    # remove all the junk kwds that are added due to poor design!
    for junk in ('nnodes', 'nodes', 'launcher', 'mapper', 'queue',
                 'timelimit', 'scheduler', 'ncpus', 'servers'):
        kwds.pop(junk, None)

    pool = tp if threads else mp
    if proctype in ('unordered',):
        mapper = pool.uimap
    elif proctype in ('non-blocking', 'ordered'):
        mapper = pool.imap
    else:
        # 'blocking' and any unrecognised type use the blocking map
        mapper = pool.map
    return mapper(function, sequence, *args, **kwds)
def extract(self):
    """Parse the MDA section out of every ``.txt`` file in ``self.txt_dir``.

    Each file is handed to ``self.parse_mda`` in a process pool of at most
    8 workers. A found section is written to ``self.mda_dir`` as
    ``<name>.mda``. The outcome of every file is written to ``parsing.log``
    and the number of files without a ``SUCCESS`` status is printed.
    """
    def text_gen(txt_dir):
        """Yield the names of the ``.txt`` files found in ``txt_dir``."""
        for entry in os.listdir(txt_dir):
            if entry.endswith('.txt'):
                yield entry

    def parsing_job(fname):
        """Parse one file and return ``(file name, status message)``."""
        print("Parsing: {}".format(fname))

        # Read text
        with codecs.open(os.path.join(self.txt_dir, fname), 'rb',
                         encoding='utf-8') as fin:
            text = fin.read()
        name = os.path.splitext(fname)[0]

        # Parse MDA part
        mda, end = self.parse_mda(text)
        # A very short first match is treated as an index entry, so parse
        # again starting from where the first match ended.
        if mda and len(mda.encode('utf-8')) < 1000:
            mda, _ = self.parse_mda(text, start=end)

        if not mda:
            msg = "MDA NOT FOUND"
        else:
            msg = "SUCCESS"
            mda_path = os.path.join(self.mda_dir, name + '.mda')
            with codecs.open(mda_path, 'w', encoding='utf-8') as fout:
                fout.write(mda)

        print("{},{}".format(name, msg))
        return name + '.txt', msg

    workers = min(cpu_count(), 8)
    pool = ProcessPool(workers)

    started = time.time()
    outcomes = pool.map(parsing_job, text_gen(self.txt_dir))
    finished = time.time()
    print("MDA parsing time taken: {} seconds.".format(finished - started))

    # Write the outcome of every file and count the unsuccessful ones
    failures = 0
    parsing_log = 'parsing.log'
    with open(parsing_log, 'w') as fout:
        print("Writing parsing results to {}".format(parsing_log))
        for name, msg in outcomes:
            fout.write('{},{}\n'.format(name, msg))
            if msg != "SUCCESS":
                failures += 1

    print("Number of failed text:{}".format(failures))
def __init__(self, raw: Optional[Matrix] = None,
             bg: Optional[Matrix] = None,
             bg_ratio: float = 1,
             path: Optional[Union[str, Path]] = 'saved_run/ensemble'):
    """ Sets up attributes.

    Args:
        raw: The model matrix to perturb. If a background is provided,
            this is the "prompt+bg" matrix.
        bg: Background matrix to subtract.
        bg_ratio: Prompt is obtained by `raw - bg_ratio * bg`. Defaults
            to 1. This is the case for equal time gate length of
            `prompt+bg` and `bg`.
        path: The path where to save the ensemble. The directory is
            created (including parents) unless `None` is given.
            NOTE(review): the previous docstring said a saved ensemble is
            loaded from this path; nothing in this initializer loads
            anything, so call load([path]) explicitly.
    """
    self.raw: Optional[Matrix] = raw
    self.bg: Optional[Matrix] = bg
    self.bg_ratio: Optional[float] = bg_ratio
    self.prompt_w_bg: Optional[Matrix] = raw

    self.unfolder: Optional[Callable[[Matrix], Matrix]] = None
    self.first_generation_method: \
        Optional[Callable[[Matrix], Matrix]] = None

    self.size = 0
    self.regenerate = False

    self.action_prompt_w_bg = Action('matrix')
    self.action_bg = Action('matrix')
    self.action_raw = Action('matrix')
    self.action_unfolded = Action('matrix')
    self.action_firstgen = Action('matrix')

    self.std_raw: Optional[Matrix] = None
    self.std_unfolded: Optional[Matrix] = None
    self.std_firstgen: Optional[Matrix] = None

    self.raw_ensemble: Optional[Matrix] = None
    self.unfolded_ensemble: Optional[Matrix] = None
    self.firstgen_ensemble: Optional[Matrix] = None

    self.seed: int = 987654
    # Leave one cpu free when more than one is available.
    self.nprocesses: int = cpu_count() - 1 if cpu_count() > 1 else 1

    if path is None:
        self.path = None
    else:
        self.path = Path(path)
        self.path.mkdir(exist_ok=True, parents=True)

    # `raw` defaults to None; setting the state unconditionally made the
    # no-argument constructor fail with AttributeError.
    if self.raw is not None:
        self.raw.state = "raw"
def __init__(self, *args, **kwds):
    """\nNOTE: if number of nodes is not given, will autodetect processors

    The node count may be given positionally, as ``nodes`` or as ``ncpus``;
    giving it more than one way raises TypeError. An optional ``id``
    keyword identifies the pool; it defaults to the node count.
    """
    has_nodes = 'nodes' in kwds
    n_positional = len(args)

    # reject the node count being specified more than once
    if 'ncpus' in kwds and (has_nodes or n_positional):
        raise TypeError("got multiple values for keyword argument 'ncpus'")
    if has_nodes and n_positional:
        raise TypeError("got multiple values for keyword argument 'nodes'")

    # fold 'nodes' or the positional argument into 'ncpus'
    if has_nodes: #XXX: multiple try/except is faster?
        kwds['ncpus'] = kwds.pop('nodes')
    elif n_positional:
        kwds['ncpus'] = args[0]
    self.__nodes = kwds.get('ncpus', cpu_count())

    # Create an identifier for the pool
    pool_id = kwds.get('id', None) #'pool'
    self._id = self.__nodes if pool_id is None else pool_id

    # Create a new server if one isn't already initialized
    self._serve()
    return
def upload_to_es(self, parallel=True, reindex_nested_features=False):
    """
    Loops through each file, loads in dictionaries, combines, and uploads
    to elastic search

    inputs:
        parallel (bool): upload through a process pool with one worker per
            cpu when True, otherwise upload the files one after another
        reindex_nested_features (bool): optional boolean to reindex
            features for nested search
    """
    # Choose the uploader once instead of repeating each branch.
    if reindex_nested_features:
        upload = self.upload_single_file_with_reindex
    else:
        upload = self.upload_single_file

    if not parallel:
        for x in self.fulltext:
            upload(x)
        return

    # start multiprocessing pool
    pool = pp.ProcessPool(cpu_count())
    try:
        # Drain the iterator; the results themselves are not used.
        for _ in tqdm(pool.imap(upload, self.fulltext),
                      total=len(self.fulltext)):
            pass
    finally:
        # Release the pool even when an upload fails, and clear it as the
        # other pool users in this code base do after close()/join().
        pool.close()
        pool.join()
        pool.clear()
def _serve(self, nodes=None, servers=None): #XXX: is a STATE method; use id
    """Create a new server if one isn't already initialized

    Returns the pp.Server registered under this pool's id, after making
    its ppservers and cpu count agree with the request.
    """
    # bring nodes and servers into the form used by pp.Server
    if nodes is None:
        nodes = self.nodes #XXX: autodetect must be explicit
    if nodes in ('*',):
        nodes = 'autodetect'
    if servers is None:
        servers = tuple(sorted(self.__servers)) # no servers is ()
    elif servers in ('*', 'autodetect'):
        servers = ('*',)

    # reuse the server stored for this id; if there is none, create one
    server = __STATE.get(self._id, None)
    if not server:
        server = pp.Server(ppservers=servers)

    # render the server's ppservers as colon-joined strings, then compare
    auto = [('*',)] if server.auto_ppservers else []
    current = tuple(':'.join(map(str, tup))
                    for tup in sorted(server.ppservers + auto))
    if servers != current: #XXX: assume servers specifies ports if desired
        server = pp.Server(ppservers=servers)

    # resolve 'autodetect' to a number before comparing cpu counts
    wanted = cpu_count() if nodes == 'autodetect' else nodes
    if wanted != server.get_ncpus():
        server.set_ncpus(nodes) # allows ncpus=0

    # set (or 'repoint') the server
    __STATE[self._id] = server
    # set the 'self' internals
    self.__nodes = None if nodes in ('autodetect',) else nodes
    self.__servers = servers
    return server
def write_image_name_on_images_in_dir_parallel(img_files,
                                               input_folder_path,
                                               output_folder_path,
                                               override=False,
                                               xy=(0, 0),
                                               text_color=(255, 255, 255),
                                               font_size=40,
                                               background_color=(0, 0, 0),
                                               number_gpus=6):
    """Write image names onto images, split over several worker processes.

    ``img_files`` is split into ``number_gpus`` parts; each part is passed,
    together with the remaining arguments, to
    ``ImageFileHandler.write_image_name_on_images_in_dir_sequential`` as an
    asynchronous pool job. The call returns once every job has finished.
    """
    logger.info('write_image_name_on_images_in_dir_parallel: ...')
    logger.info('Using ' + str(number_gpus) + ' of ' + str(cpu_count()) + 'CPUs')

    from Utility.List_Extension import ListExtension
    parts = ListExtension.split_list_in_n_parts(img_files, number_gpus)

    with ProcessingPool() as pool:
        # Submit one asynchronous job per part
        pending = [
            pool.apipe(
                ImageFileHandler.write_image_name_on_images_in_dir_sequential,
                parts[part_id],
                input_folder_path,
                output_folder_path,
                override,
                xy,
                text_color,
                font_size,
                background_color,
            )
            for part_id in range(number_gpus)
        ]

        # Collect the asynchronous calls
        for job in pending:
            job.get()

    logger.info('write_image_name_on_images_in_dir_parallel: Done')
def __init__(self, model, step_finish, args = None, split = 0, buffer = 6,
             recombine = None, recombine_args = None, verbose = False,
             boundary_pass = 1):
    """Set up the distributed wrapper around ``model``.

    Args:
        model: Model instance; it is passed through ``self.grid_adjust``
            and the result is stored as ``self.model``.
        step_finish: Function used to finish steps.
        args: Stored as ``self.step_args``.
        split: Number of cpus to use; 0 means use ``cpu_count()``.
        buffer: Stored as ``self.buffer``.
        recombine: Recombine function; ``self.recombine_default`` is used
            when None.
        recombine_args: Stored as ``self.recombine_args``.
        verbose: Stored as ``self.verbose``.
        boundary_pass: Stored as ``self.boundary_pass``.
    """
    self.model = self.grid_adjust(model)
    #add in function to finish steps
    self.step_finish = step_finish
    self.step_args = args

    #Get the number of CPUs unless user specified
    self.ncpus = cpu_count() if split == 0 else split
    # NOTE: pool creation is disabled in this version:
    #self.pool = ProcessPool(nodes = self.ncpus)

    self.buffer = buffer
    self.multi_models = collections.OrderedDict()
    #dictionary to track when all steps on each processor complete
    self.sync_status = collections.OrderedDict()
    #add ability for user to deconflict
    self.boundary_pass = boundary_pass

    # Identity test for None (PEP 8); the arguments are stored either way.
    if recombine is None:
        self.recombine = self.recombine_default
    else:
        self.recombine = recombine
    self.recombine_args = recombine_args

    self.verbose = verbose
def _serve(self, nodes=None, servers=None): #XXX: is a STATE method; use id
    """Create a new server if one isn't already initialized

    Returns the pp.Server registered under this pool's id, after making
    its ppservers and cpu count agree with the request.
    """
    # bring nodes and servers into the form used by pp.Server
    if nodes is None:
        nodes = self.nodes #XXX: autodetect must be explicit
    if nodes in ('*',):
        nodes = 'autodetect'
    if servers is None:
        servers = tuple(sorted(self.__servers)) # no servers is ()
    elif servers in ('*', 'autodetect'):
        servers = ('*',)

    # reuse the server stored for this id; if there is none, create one
    server = __STATE.get(self._id, None)
    if not server:
        server = pp.Server(ppservers=servers)

    # render the server's ppservers as colon-joined strings, then compare
    auto = [('*',)] if server.auto_ppservers else []
    current = tuple(':'.join(map(str, tup))
                    for tup in sorted(server.ppservers + auto))
    if servers != current: #XXX: assume servers specifies ports if desired
        server = pp.Server(ppservers=servers)

    # resolve 'autodetect' to a number before comparing cpu counts
    wanted = cpu_count() if nodes == 'autodetect' else nodes
    if wanted != server.get_ncpus():
        server.set_ncpus(nodes) # allows ncpus=0

    # set (or 'repoint') the server
    __STATE[self._id] = server
    # set the 'self' internals
    self.__nodes = None if nodes in ('autodetect',) else nodes
    self.__servers = servers
    return server
def main_jagcat():
    """Remove the old row-count log, then submit every sender to a pool.

    Each sender is submitted asynchronously with its config's topic and
    Kafka key; the function returns after the pool has been closed and
    joined.
    """
    rowcount_log = 'log\\' + '_'.join((GeneralConfig.ENV,
                                        GeneralConfig.TOPIC_PREFIX,
                                        GeneralConfig.ROWCOUNT_LOG_FILE))
    if os.path.exists(rowcount_log):
        os.remove(rowcount_log)

    # one worker fewer than the cpu count, but never zero
    pool = ProcessPool(nodes=(cpu_count() - 1) or 1)

    senders = (
        (send_part_meta, PartMetaConfig),
        (send_intray, IntrayConfig),
        (send_description, DescriptionConfig),
        (send_feature, FeatureConfig),
        (send_feature_family, FeatureFamilyConfig),
        (send_hierarchy, HierarchyConfig),
        (send_hierarchy_illustration, HierarchyIllustrationConfig),
        (send_hierarchy_usage, HierarchyUsageConfig),
        (send_section_callout, SectionCalloutConfig),
        (send_section_part_usage, SectionPartUsageConfig),
        (send_vin, VinConfig),
    )
    for send, config in senders:
        pool.amap(send, [config.TOPIC], [config.KAFKA_KEY])

    pool.close()
    pool.join()
def main():
    """Remove the old row-count log, then submit the enabled senders.

    Each enabled sender is submitted asynchronously with its config's topic
    and Kafka key; the function returns after the pool has been closed and
    joined. Only the hierarchy sender is currently enabled.
    """
    rowcount_log = 'log\\' + '_'.join((GeneralConfig.ENV,
                                        GeneralConfig.TOPIC_PREFIX,
                                        GeneralConfig.ROWCOUNT_LOG_FILE))
    if os.path.exists(rowcount_log):
        os.remove(rowcount_log)

    # one worker fewer than the cpu count, but never zero
    pool = ProcessPool(nodes=(cpu_count() - 1) or 1)

    senders = (
        # (send_aftermarket_part, AftermarketPartConfig),
        # (send_description, DescriptionConfig),
        # (send_engineering_part, EngineeringPartConfig),
        # (send_engineering_part_function, EngineeringPartFunctionConfig),
        # (send_engineering_part_usage, EngineeringPartUsageConfig),
        # (send_feature, FeatureConfig),
        # (send_feature_family, FeatureFamilyConfig),
        (send_hierarchy, HierarchyConfig),
        # (send_hierarchy_illustration, HierarchyIllustrationConfig),
        # (send_hierarchy_usage, HierarchyUsageConfig),
        # (send_section_callout, SectionCalloutConfig),
        # (send_section_part_usage, SectionPartUsageConfig),
        # (send_supersession, SupersessionConfig),
        # (send_intray, IntrayConfig),
        # (send_vin, VinConfig),
    )
    for send, config in senders:
        pool.amap(send, [config.TOPIC], [config.KAFKA_KEY])

    pool.close()
    pool.join()
def download(self, index_path):
    """Download every filing listed in an index file and save it as text.

    Args:
        index_path: Path to a comma-separated index file whose rows are
            ``form_type, company_name, cik, date_filed, filename``.

    Text files are written to ``self.txt_dir``. Filings whose text file
    already exists are skipped. A failure on one filing is printed and does
    not stop the remaining downloads.
    """
    def iter_path_generator(index_path):
        """Yield the filing URL for each row of the index."""
        with open(index_path, 'r') as fin:
            reader = csv.reader(fin, delimiter=',', quotechar='\"',
                                quoting=csv.QUOTE_ALL)
            for row in reader:
                form_type, company_name, cik, date_filed, filename = row
                # Normalise any backslashes so the result is a valid URL.
                url = os.path.join(SEC_GOV_URL, filename).replace("\\", "/")
                yield url

    def download_job(url):
        """Fetch one filing, strip the markup and write the text file."""
        fname = '_'.join(url.split('/')[-2:])
        fname = os.path.splitext(fname)[0]
        text_path = os.path.join(self.txt_dir, fname + '.txt')

        if os.path.exists(text_path):
            print("Already exists, skipping {}".format(url))
            return

        print("Downloading & Parsing {}".format(url))
        try:
            # The request is inside the try so a network error on one URL
            # is reported instead of aborting the whole map.
            r = requests.get(url)
            # Do not store an HTTP error page as if it were the filing;
            # it would then be skipped as "already exists" on later runs.
            r.raise_for_status()
            # Parse html with Beautiful Soup
            soup = BeautifulSoup(r.content, "html.parser")
            text = soup.get_text("\n")
            # Process Text
            text = self._process_text(text)
            # Write to file
            with codecs.open(text_path, 'w', encoding='utf-8') as fout:
                fout.write(text)
        except Exception as e:
            # Exception (not BaseException) so KeyboardInterrupt and
            # SystemExit still propagate.
            print("{} parsing failed: {}".format(url, e))

    # Use at most 8 worker processes.
    ncpus = min(cpu_count(), 8)
    pool = ProcessPool(ncpus)
    pool.map(download_job, iter_path_generator(index_path))
def _equals(self, server):
    """check if the server is compatible

    True when `server` has the same cpu count and the same ppservers as
    this pool; False otherwise, or when `server` is falsy.
    """
    if not server:
        return False

    # no explicit node count means "as many as there are cpus"
    wanted = cpu_count() if self.__nodes is None else self.__nodes
    if wanted != server.get_ncpus():
        return False

    # render the server's ppservers as colon-joined strings, then compare
    auto = [('*',)] if server.auto_ppservers else []
    current = [':'.join(map(str, tup))
               for tup in sorted(server.ppservers + auto)]
    return sorted(self.__servers) == current
def _equals(self, server):
    """check if the server is compatible

    True when `server` has the same cpu count and the same ppservers as
    this pool; False otherwise, or when `server` is falsy.
    """
    if not server:
        return False

    # no explicit node count means "as many as there are cpus"
    wanted = cpu_count() if self.__nodes is None else self.__nodes
    if wanted != server.get_ncpus():
        return False

    # render the server's ppservers as colon-joined strings, then compare
    auto = [('*',)] if server.auto_ppservers else []
    current = [':'.join(map(str, tup))
               for tup in sorted(server.ppservers + auto)]
    return sorted(self.__servers) == current
def __init__(self, *args, **kwds):
    """\nNOTE: if number of nodes is not given, will autodetect processors

    The node count may be given positionally, as ``nodes`` or as ``ncpus``;
    giving it more than one way raises TypeError.
    """
    # `in` and `raise TypeError(msg)` replace has_key() and the
    # `raise TypeError, msg` statement, which are invalid in Python 3;
    # both replacements also work on Python 2.
    hasnodes = 'nodes' in kwds
    arglen = len(args)
    if 'ncpus' in kwds and (hasnodes or arglen):
        msg = "got multiple values for keyword argument 'ncpus'"
        raise TypeError(msg)
    elif hasnodes: #XXX: multiple try/except is faster?
        if arglen:
            msg = "got multiple values for keyword argument 'nodes'"
            raise TypeError(msg)
        kwds['ncpus'] = kwds.pop('nodes')
    elif arglen:
        kwds['ncpus'] = args[0]
    self.__nodes = kwds.get('ncpus', cpu_count())

    # Create a new server if one isn't already initialized
    if not __STATE['pool'] or self.__nodes != cpu_count():
        __STATE['pool'] = Pool(self.__nodes)
    return
def run_qaqc(self, las_paths):
    """Run the QAQC checks for the given las paths.

    When ``self.config.multiprocess`` is false, all paths are passed to
    ``self.run_qaqc_checks`` in one call. Otherwise each path is passed to
    ``self.run_qaqc_checks_multiprocess`` through a process pool sized to
    half the cpu count (at least one), with a progress bar.
    """
    if not self.config.multiprocess:
        self.run_qaqc_checks(las_paths)
        return

    workers = max(int(ph.cpu_count() / 2), 1)
    pool = pp.ProcessPool(workers)

    # Drain the iterator; only the side effects of the checks matter here.
    progress = tqdm(pool.imap(self.run_qaqc_checks_multiprocess, las_paths),
                    total=len(las_paths), ascii=True)
    for _ in progress:
        pass

    pool.close()
    pool.join()
    pool.clear()
def avaliacao(populacao):
    """Evaluate every individual of the population in parallel.

    The population is converted with ``valores``; row ``k`` of the result
    is passed as ``sequence`` to ``lm.move`` starting at ``startpoint``.
    Returns an array with one ``lm.move`` result per individual.
    """
    decoded = valores(populacao)
    total = len(populacao)

    def steps(k):
        """Return the ``lm.move`` result for individual ``k``."""
        return lm.move(startpoint, sequence=decoded[k, :])

    with Pool(cpu_count()) as pool:
        return array(pool.map(steps, range(total)))
def test_pathos_pp_callable():
    """Test parallel processing with pathos: ParallelPool

    Returns the histogram accumulated from all jobs, or None when the test
    is skipped.
    """
    logger = getLogger("ostap.test_pathos_pp_callable")

    if not pathos:
        logger.error("pathos is not available")
        return

    logger.info('Test job submission with %s' % pathos)

    if DILL_PY3_issue:
        logger.warning("test is disabled (DILL/ROOT/PY3 issue)")
        return

    ## logger.warning ("test is disabled for UNKNOWN REASON")
    ## return

    from pathos.helpers import cpu_count
    ncpus = cpu_count()

    from pathos.pools import ParallelPool as Pool
    pool = Pool(ncpus)
    logger.info("Pool is %s" % (type(pool).__name__))

    pool.restart(True)

    mh = MakeHisto()
    pending = pool.uimap(mh.process, list(enumerate(inputs)))

    # Add up the histograms as the jobs complete
    result = None
    for histo in progress_bar(pending, max_value=len(inputs)):
        if result:
            result.Add(histo)
        else:
            result = histo

    pool.close()
    pool.join()
    pool.clear()

    logger.info("Histogram is %s" % result.dump(80, 10))
    logger.info("Entries %s/%s" % (result.GetEntries(), sum(inputs)))

    with wait(1), use_canvas('test_pathos_pp_callable'):
        result.draw()

    return result
def _clear(self): #XXX: should be STATE method; use id
    """Remove server with matching state

    The stored server is dropped only when its cpu count and ppservers both
    match this pool; otherwise nothing is changed.
    """
    server = __STATE['server']
    if not server:
        return

    # no explicit node count means "as many as there are cpus"
    wanted = cpu_count() if self.__nodes is None else self.__nodes
    if wanted != server.get_ncpus():
        return

    # render the server's ppservers as colon-joined strings, then compare
    auto = [('*',)] if server.auto_ppservers else []
    current = [':'.join(map(str, tup))
               for tup in sorted(server.ppservers + auto)]
    if sorted(self.__servers) != current:
        return

    # it's the 'same' (better to check _pool.secret?)
    __STATE['server'] = None
    return #_pool
def _clear(self): #XXX: should be STATE method; use id
    """Remove server with matching state

    The stored server is dropped only when its cpu count and ppservers both
    match this pool; otherwise nothing is changed.
    """
    server = __STATE['server']
    if not server:
        return

    # no explicit node count means "as many as there are cpus"
    wanted = cpu_count() if self.__nodes is None else self.__nodes
    if wanted != server.get_ncpus():
        return

    # render the server's ppservers as colon-joined strings, then compare
    auto = [('*',)] if server.auto_ppservers else []
    current = [':'.join(map(str, tup))
               for tup in sorted(server.ppservers + auto)]
    if sorted(self.__servers) != current:
        return

    # it's the 'same' (better to check _pool.secret?)
    __STATE['server'] = None
    return #_pool
def avaliacao(self, populacao):
    """Evaluate the objective function for every individual in parallel.

    Row ``k`` of ``populacao`` is passed to ``self.funcao_objetivo``.
    Returns an array with one objective value per individual. The pool is
    closed, joined and cleared, and ``shutdown()`` is called before
    returning.
    """
    total = len(populacao)

    def steps(k):
        """Return the objective value of individual ``k``."""
        return self.funcao_objetivo(populacao[k, :])

    pool = ProcessPool(nodes=cpu_count())
    pesos = array(pool.map(steps, range(total)))

    pool.close()
    pool.join()
    pool.clear()
    shutdown()
    return pesos
def __init__(self, *args, **kwds):
    """\nNOTE: if number of nodes is not given, will try to grab the number
of nodes from the associated scheduler, and failing will count the local cpus.
If workdir is not given, will default to scheduler's workdir or $WORKDIR.
If scheduler is not given, will default to only run on the current node.
If pickle is not given, will attempt to minimally use TemporaryFiles.

For more details, see the docstrings for the "map" method, or the man page
for the associated launcher (e.g mpirun, mpiexec).
    """
    Mapper.__init__(self, *args, **kwds)
    self.scatter = bool(kwds.get('scatter', False)) #XXX: hang w/ nodes=1 ?
   #self.nodes = kwds.get('nodes', None)
    # `in` replaces has_key(), which no longer exists in Python 3.
    if not len(args) and 'nodes' not in kwds:
        if self.scheduler:
            self.nodes = self.scheduler.nodes
        else:
            self.nodes = cpu_count()
    return
def __init__(self, *args, **kwds):
    """\nNOTE: if number of nodes is not given, will autodetect processors

    The node count may be given positionally, as ``nodes`` or as ``ncpus``;
    giving it more than one way raises TypeError.
    """
    # `in` and `raise TypeError(msg)` replace has_key() and the
    # `raise TypeError, msg` statement, which are invalid in Python 3;
    # both replacements also work on Python 2.
    hasnodes = 'nodes' in kwds
    arglen = len(args)
    if 'ncpus' in kwds and (hasnodes or arglen):
        msg = "got multiple values for keyword argument 'ncpus'"
        raise TypeError(msg)
    elif hasnodes: #XXX: multiple try/except is faster?
        if arglen:
            msg = "got multiple values for keyword argument 'nodes'"
            raise TypeError(msg)
        kwds['ncpus'] = kwds.pop('nodes')
    elif arglen:
        kwds['ncpus'] = args[0]
    self.__nodes = kwds.get('ncpus', cpu_count())

    # Create a new server if one isn't already initialized
    self._serve()
    return
def __init__(self, ncpus='autodetect', ppservers=None, silent=False):
    """Create the underlying pathos pool.

    Args:
        ncpus: Number of cpus; 'autodetect' uses ``cpu_count()``.
        ppservers: Optional servers. When given, a ``ppServer`` session is
            opened for each and a ParallelPool is created ('cluster' mode);
            otherwise a local ProcessPool is created ('multicore' mode).
        silent: Stored as a bool in ``self.silent``.
    """
    if ncpus != 'autodetect':
        self.ncpus = ncpus
    else:
        from pathos.helpers import cpu_count
        self.ncpus = cpu_count()

    if not ppservers:
        from pathos.multiprocessing import ProcessPool as MPPool
        self.pool = MPPool(self.ncpus)
        self.mode = 'multicore'
    else:
        self._ppservers = ppservers
        self.sessions = [ppServer(srv) for srv in ppservers]
        self.ppservers = tuple(s.local_server for s in self.sessions)

        from pathos.parallel import ParallelPool as PPPool
        self.pool = PPPool(ncpus=self.ncpus, ppservers=self.ppservers)
        self.mode = 'cluster'

        from pathos.parallel import stats as pp_stats
        self.pp_stats = pp_stats

    self.stats = {}
    self.silent = bool(silent)
def __init__(self, model, step_finish, args = None, split = 0, buffer = 6,
             recombine = None, recombine_args = None, verbose = False,
             boundary_pass = 1):
    """Set up the distributed wrapper around ``model``.

    Args:
        model: Model instance; it is passed through ``self.grid_adjust``
            and the result is stored as ``self.model``.
        step_finish: Function used to finish steps.
        args: Optional arguments for the step_finish function.
        split: Number of cpus to use; 0 means use ``cpu_count()``.
        buffer: Buffer size.
        recombine: Recombine function; ``self.recombine_default`` is used
            when None.
        recombine_args: Stored as ``self.recombine_args``.
        verbose: If True tells the user what is being recombined and what
            is being ignored.
        boundary_pass: Stored as ``self.boundary_pass``.
    """
    #Creates an instance of the model changing the Mesa space to the
    #distributed space
    self.model = self.grid_adjust(model)
    #add in function to finish steps
    self.step_finish = step_finish
    #optional arguments for step_finish function
    self.step_args = args

    #Get the number of CPUs unless user specified
    self.ncpus = cpu_count() if split == 0 else split
    #create the number of process available
    self.pool = ProcessPool(nodes=self.ncpus)
    self.pipes = self.pipe_setup(self.ncpus)

    #buffer size
    self.buffer = buffer
    self.multi_models = collections.OrderedDict()
    #dictionary to track when all steps on each processor complete
    self.sync_status = collections.OrderedDict()
    #add ability for user to deconflict
    self.boundary_pass = boundary_pass

    #Either use the default recombine function or the user adds their own.
    #Identity test for None (PEP 8); the arguments are stored either way.
    if recombine is None:
        self.recombine = self.recombine_default
    else:
        self.recombine = recombine
    self.recombine_args = recombine_args

    #if True tells the user what is being recombined and what is being
    #ignored
    self.verbose = verbose
def __init__(self, *args, **kwds):
    """\nNOTE: if number of nodes is not given, will autodetect processors

    The node count may be given positionally, as ``nodes`` or as ``ncpus``;
    giving it more than one way raises TypeError.
    """
    has_nodes = 'nodes' in kwds
    n_positional = len(args)

    # reject the node count being specified more than once
    if 'ncpus' in kwds and (has_nodes or n_positional):
        raise TypeError("got multiple values for keyword argument 'ncpus'")
    if has_nodes and n_positional:
        raise TypeError("got multiple values for keyword argument 'nodes'")

    # fold 'nodes' or the positional argument into 'ncpus'
    if has_nodes: #XXX: multiple try/except is faster?
        kwds['ncpus'] = kwds.pop('nodes')
    elif n_positional:
        kwds['ncpus'] = args[0]
    self.__nodes = kwds.get('ncpus', cpu_count())

    # Create an identifier for the pool
    self._id = 'pool'

    # Create a new server if one isn't already initialized
    self._serve()
    return
def test_pathos_mp_function():
    """Test parallel processing with pathos: ProcessPool

    Maps ``make_histo`` over the enumerated ``inputs`` with an unordered
    parallel map, adds the resulting histograms together, logs a summary,
    draws the sum and returns it. Returns None early when pathos is not
    available or when the DILL/ROOT/PY3 issue flag is set.
    """
    logger = getLogger("ostap.test_pathos_mp_function")

    if not pathos:
        logger.error("pathos is not available")
        return

    logger.info('Test job submission with %s' % pathos)

    if DILL_PY3_issue:
        logger.warning("test is disabled (DILL/ROOT/PY3 issue)")
        return

    from pathos.helpers import cpu_count
    nworkers = cpu_count()

    from pathos.pools import ProcessPool
    pool = ProcessPool(nworkers)

    logger.info("Pool is %s" % (type(pool).__name__))

    with pool_context(pool):
        jobs = pool.uimap(make_histo, zip(count(), inputs))

        # the first truthy histogram becomes the accumulator, the rest are added
        result = None
        for histo in progress_bar(jobs, max_value=len(inputs)):
            if result:
                result.Add(histo)
            else:
                result = histo

    logger.info("Histogram is %s" % result.dump(80, 10))
    logger.info("Entries %s/%s" % (result.GetEntries(), sum(inputs)))

    with wait(1), use_canvas('test_pathos_mp_function'):
        result.draw()

    return result
def get_returns(tickers, folder=None, freq='d', fromdate='2000-01-01', todate='2018-12-31', forward_bars=None, data_col=None, is_log=False, is_debug=False, is_multiprocess=False):
    """Collect the result of ``YahooProcessor._get_return`` for every ticker.

    Args:
        tickers: Sized iterable of ticker identifiers.
        folder, freq, fromdate, todate, forward_bars, is_log: Forwarded
            unchanged to ``YahooProcessor._get_return``.
        data_col: Column name forwarded to ``YahooProcessor._get_return``;
            ``None`` selects 'Adj Close'.
        is_debug: When true, log each ticker as it is loaded.
        is_multiprocess: When true, fan the work out over a
            ``Pool(cpu_count())``; otherwise run sequentially.

    Returns:
        dict mapping each ticker to the value returned for it.
    """
    data_col = 'Adj Close' if data_col is None else data_col
    res = {}
    tqdm, use_ascii = get_tqdm()
    logger = get_logger()
    logger.info('Loading Yahoo Labels...')

    # Everything the worker needs is bound through default arguments so the
    # function carries its own state.
    def _get_return(ticker, res=res, folder=folder, freq=freq, fromdate=fromdate,
                    todate=todate, is_debug=is_debug, forward_bars=forward_bars,
                    data_col=data_col, is_log=is_log):
        if is_debug:
            get_logger().info('Loading %s' % ticker)
        value = YahooProcessor._get_return(ticker, folder, freq, fromdate, todate,
                                           forward_bars, data_col, is_log)
        res[ticker] = value
        return ticker, value

    if not is_multiprocess:
        # Sequential path: the worker fills `res` in place; the loop only
        # drives the progress bar to completion.
        for _ in tqdm(map(_get_return, tickers), total=len(tickers), ascii=use_ascii):
            pass
        return res

    logger.info('Initalized Multiprocess To Get Returns...')
    with Pool(cpu_count()) as p:
        res_pool = list(tqdm(p.imap(_get_return, tickers), total=len(tickers), ascii=use_ascii))
    # Workers filled their own copies of `res`, so rebuild it from the
    # returned (ticker, value) pairs.
    return dict(res_pool)
def set_num_threads(value=ph.cpu_count()):
    """Sets and updates the global NUM_THREADS variable.

    The value is validated and then written, as a string, to the
    ``NUM_THREADS`` option of the ``COMMON`` section of ``CONFIG_DATA``.

    Parameters
    ----------
    value : int, optional
        The number of threads the program is allowed to use
        (default: the value of ``ph.cpu_count()`` when this module was loaded).

    Raises
    ------
    TypeError
        When `value` is not an integer (booleans are rejected as well).
    ValueError
        When `value` is smaller than 1.
    """
    # bool is a subclass of int; without the explicit exclusion True would be
    # accepted and stored as the string 'True'.
    if isinstance(value, bool) or not isinstance(value, int):
        raise TypeError("The NUM_THREADS variable needs to be of type integer", value)
    if value < 1:
        raise ValueError(
            "The NUM_THREADS variable needs to be higher than zero", value)
    CONFIG_DATA.set('COMMON', 'NUM_THREADS', str(value))
def check_nodes(pool, state):
    """Exercise close/clear/terminate on a pool type and check ``state``.

    ``pool`` is an already constructed pool; further pools are created from
    ``type(pool)``. ``state`` is a sized container whose length is asserted
    to be 1 or 0 at several points.
    NOTE(review): ``state`` is presumably the pool type's shared state
    registry -- confirm against the caller.

    Returns immediately (without checking anything) when fewer than two
    CPUs are reported. Raises AssertionError when any expectation fails.
    """
    new_pool = type(pool)
    nodes = cpu_count()
    # the checks below need a strictly smaller node count than the default
    if nodes < 2: return
    half = nodes//2

    res = pool.map(squared, range(2))
    assert res == [0, 1]
    pool.close()

    # doesn't create a new pool... IS IT BETTER IF IT DOES?
    pool = new_pool()
    # mapping must fail with PoolClosedError; success is the error case
    try:
        pool.map(squared, range(2))
    except PoolClosedError:
        pass
    else:
        raise AssertionError

    # creates a new pool (nodes are different)
    def nnodes(pool):
        # read the name-mangled private '__nodes' attribute of the pool class
        return getattr(pool, '_'+new_pool.__name__+'__nodes')
    old_nodes = nnodes(pool)
    pool = new_pool(nodes=half)
    new_nodes = nnodes(pool)
    if isinstance(pool, ParallelPool):
        print('SKIPPING: new_pool check for ParallelPool')#FIXME
    else:
        res = pool.map(squared, range(2))
        assert res == [0, 1]
        assert new_nodes < old_nodes

    pool.close()
    try:
        pool.map(squared, range(2))
    except PoolClosedError:
        pass
    else:
        raise AssertionError

    # creates a new pool (nodes are different)
    pool = new_pool()
    if isinstance(pool, ParallelPool):
        print('SKIPPING: new_pool check for ParallelPool')#FIXME
    else:
        res = pool.map(squared, range(2))
        assert res == [0, 1]
    pool.close()

    # doesn't create a new pool... IS IT BETTER IF IT DOES?
    pool = new_pool()
    try:
        pool.map(squared, range(2))
    except PoolClosedError:
        pass
    else:
        raise AssertionError

    # one entry is expected before clear() and none after it
    assert len(state) == 1
    pool.clear()
    assert len(state) == 0
    pool = new_pool()
    res = pool.map(squared, range(2))
    assert res == [0, 1]
    assert len(state) == 1
    # terminate() alone is expected to leave the entry in place
    pool.terminate()
    assert len(state) == 1
    pool.clear()
    assert len(state) == 0
    return
_HOLD = [] return
def _debug(boolean):
    """if True, print debugging info and save temporary files after pickling

    A true value sets the module logger to DEBUG and calls ``_save(True)``;
    a false value sets it to WARN and calls ``_save(False)``.
    """
    if boolean:
        log.setLevel(logging.DEBUG)
        _save(True)
    else:
        log.setLevel(logging.WARN)
        _save(False)
    return

# process-id infix (e.g. '.1234.') used to build the per-process file names below
_pid = '.' + str(os.getpid()) + '.'
# Default settings, all stored as strings.
# NOTE(review): presumably consumed by the launcher/scheduler code -- confirm.
defaults = {
    'nodes' : str(cpu_count()),
    'program' : which_strategy(lazy=True) or 'ezscatter.py', # serialize to tempfile
    'mpirun' : which_mpirun() or 'mpiexec',
    'python' : which_python(lazy=True) or 'python',
    'progargs' : '',
    # output, error and job-id file names carry the process id
    'outfile' : 'results%sout' % _pid,
    'errfile' : 'errors%sout' % _pid,
    'jobfile' : 'job%sid' % _pid,
    'scheduler' : '',
    'timelimit' : '00:02',
    'queue' : 'normal',
    'workdir' : '.'
    }