def process_iteration(Ns, ps, landscape, config):
    """Collect per-time-step statistics for one iteration and write them out.

    A single ``Point`` receives one time step per population in ``Ns``
    (bounded by ``ps.max_time_steps``); it is then written to
    ``<output_dir>/results.txt`` as a trajectory holding only that point.
    When ``config.display`` is set, each time step is also handed to
    ``draw_population`` on a process pool.

    :param Ns: sequence of populations, one entry per time step
    :param ps: parameter set; ``max_time_steps``, ``totalK`` and ``sentinels`` are read
    :param landscape: passed through to the statistics and drawing helpers
    :param config: reads ``output_dir``, ``ext``, ``input_dir``,
        ``background_image``, ``display``, ``num_processors`` and
        ``save_time_steps``
    """
    output_dir = config.output_dir + config.ext
    if config.background_image is not None:
        background_path = config.input_dir + "/" + config.background_image
    else:
        background_path = None

    # Create a point to hold the iteration
    p = Point()
    p.add_iteration()

    # The drawing pool exists only when display is enabled. It used to be
    # closed and joined unconditionally, which raised NameError otherwise.
    pool = mp.Pool(config.num_processors) if config.display else None

    for t in xrange(min(ps.max_time_steps, len(Ns))):
        if pool is not None:
            pool.apply_async(draw_population,
                             [Ns[t], landscape, ps.totalK, t, output_dir, 2.0, background_path])
        p.add_time_step([t] + population_statistics(ps, landscape, Ns[t]))

    if pool is not None:
        pool.close()

    # Write the iteration results to file as a trajectory containing a single point
    write_trajectories([Trajectory(points=[p])], None, ps.sentinels, output_dir + "/results.txt")

    if config.save_time_steps:
        np.savez(output_dir + "/populations.npz", *Ns)

    # Wait for outstanding drawing tasks only after the results are on disk.
    if pool is not None:
        pool.join()
def pass_data_to_search(symbol, path, start_time_seconds, end_time_seconds, date,
                        time_interval, tt, code_path):
    """Fan the file ``<path>b<date>.l.bz2`` out to 40 ``mapper`` processes.

    The file size is turned into a row count and split evenly into one slot
    per process; every worker receives its index, the slot size and the
    remaining arguments unchanged. The call blocks until all workers finish.

    NOTE(review): 69 is presumably the size in bytes of one record — confirm.
    NOTE(review): results and exceptions of the ``apply_async`` calls are
    never inspected, so a failing ``mapper`` goes unnoticed here.
    """
    file_name = path + 'b' + date + '.l.bz2'
    size = os.path.getsize(file_name)

    # Floor division keeps the counts integral on Python 3 as well
    # (plain "/" only truncated on Python 2).
    total_rows = size // 69
    total_processes1 = 40
    slots = total_rows // total_processes1

    # Multiprocessing each file as chunk
    pool = multiprocessing.Pool(total_processes1)
    for i in range(total_processes1):
        pool.apply_async(mapper, args=(i, slots, total_processes1, symbol,
                                       start_time_seconds, end_time_seconds, date,
                                       time_interval, file_name, tt, code_path))
    pool.close()
    pool.join()
def _listArtifacts(self, urls, gavs):
    """
    Loads maven artifacts from list of GAVs and tries to locate the artifacts in one of the
    specified repositories.

    :param urls: repository URLs where the given GAVs can be located
    :param gavs: List of GAVs
    :returns: Dictionary where index is MavenArtifact object and value is its repo root URL.
    """
    located = {}

    def locate(gav):
        # The first repository that has the artifact wins; a warning is
        # logged when none of them does.
        candidate = MavenArtifact.createFromGAV(gav)
        for repo_url in urls:
            if not maven_repo_util.gavExists(repo_url, candidate):
                continue
            # Critical section?
            located[candidate] = ArtifactSpec(
                repo_url, [ArtifactType(candidate.artifactType, True, set(['']))])
            break
        else:
            logging.warning('Artifact %s not found in any url!', candidate)

    workers = ThreadPool(maven_repo_util.MAX_THREADS)
    for gav in gavs:
        workers.apply_async(locate, [gav])

    # Close the pool and wait for the workers to finish
    workers.close()
    workers.join()
    return located
def main():
    """Run ``_run_simulation`` once per requested ``min_yes`` value on a thread pool.

    Command line: ``<prog> [min_yeses] [out_csv_file]``. The work directory is
    taken from the Paperwork configuration, its index is reloaded, and every
    simulation is given the same ``csv.writer`` appending to ``out_csv_file``.
    """
    if len(sys.argv) < 3:
        print("Syntax:")
        usage = " {} [min_yeses] [out_csv_file]".format(sys.argv[0])
        print(usage)
        sys.exit(1)

    # SECURITY NOTE(review): eval() executes whatever expression is passed on
    # the command line. Acceptable only while this stays a developer tool.
    min_yeses = eval(sys.argv[1])
    out_csv_file = sys.argv[2]

    paperwork_config = config.PaperworkConfig()
    paperwork_config.read()

    src_dir = paperwork_config.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir)
    src_dsearch.reload_index()

    pool = multiprocessing.pool.ThreadPool(processes=multiprocessing.cpu_count())

    with open(out_csv_file, 'a', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        for min_yes in min_yeses:
            task_args = (src_dsearch, min_yes, csvwriter,)
            pool.apply_async(_run_simulation, task_args)
        # Wait inside the "with" so the file is still open while workers write.
        pool.close()
        pool.join()
    print("All done !")
def papply( f, seq, pool_size=cores, callback=None ):
    """
    Apply the given function to each element of the given sequence, optionally invoking the given
    callback with the result of each application. Do so in parallel, using a thread pool no
    larger than the given size.

    With a pool size of 1 the function (and the callback, if given) is invoked serially in the
    current thread. An empty sequence is a no-op and never creates a pool.

    :param callable f: the function to be applied

    :param Sequence seq: the input sequence; each element is the argument tuple for one call

    :param int pool_size: the desired pool size, if absent the number of CPU cores will be used.
    The actual pool size may be smaller if the input sequence is small.

    :param callable callback: an optional function to be invoked with the return value of f

    >>> l=[]; papply( lambda a, b: a + b, [], 1, callback=l.append ); l
    []
    >>> l=[]; papply( lambda a, b: a + b, [ (1, 2) ], 1, callback=l.append); l
    [3]
    >>> l=[]; papply( lambda a, b: a + b, [ (1, 2), (3, 4) ], 1, callback=l.append ); l
    [3, 7]
    >>> l=[]; papply( lambda a, b: a + b, [], 2, callback=l.append ); l
    []
    """
    if pool_size == 1:
        for args in seq:
            # f( *args ) replaces the apply() builtin, which is gone in Python 3
            result = f( *args )
            if callback is not None:
                callback( result )
    elif len( seq ):
        # Only reached with at least one element, so the pool size is never 0
        with thread_pool( min( pool_size, len( seq ) ) ) as pool:
            for args in seq:
                pool.apply_async( f, args, callback=callback )
def create_process_pool(index):
    """Print *index*, then run ``print_process_index(index, n)`` for n in 0..2.

    One worker process is started per sub-index; the call returns once all
    of them have finished.
    """
    print(index)
    sub_indices = range(3)
    workers = multiprocessing.Pool(processes=len(sub_indices))
    for sub_index in sub_indices:
        workers.apply_async(print_process_index, (index, sub_index))
    workers.close()
    workers.join()
def update_all(opts):
    """Updates all menus"""
    # One worker per updater; submission order matches the original sequence.
    updaters = (
        update_applications,
        update_bookmarks,
        update_recent_files,
        update_devices,
        update_rootmenu,
    )
    pool = NoDaemonPool(processes=5)
    for updater in updaters:
        pool.apply_async(updater, (opts,))
    pool.close()
    pool.join()
def buildList(self):
    """
    Build the artifact "list" from sources defined in the given configuration.

    Every configured source is read by ``self._read_artifact_source`` on a
    thread pool shared by all sources of the same type; results arrive through
    the ``self._add_result`` callback. While the readers run, ``self.errors``
    is polled once per second and all pools are terminated as soon as an
    error shows up.

    :returns: Dictionary described above.
    :raises RuntimeError: if ``self.errors`` is not empty once the pools are done
    """
    priority = 0
    pool_dict = {}

    for source in self.configuration.artifactSources:
        priority += 1
        source_type = source['type']
        # Create the per-type pool lazily. dict.setdefault() evaluated its
        # default eagerly, so a fresh ThreadPool was built (and leaked) for
        # every source whose type already had a pool.
        pool = pool_dict.get(source_type)
        if pool is None:
            pool = pool_dict[source_type] = ThreadPool(self.MAX_THREADS_DICT[source_type])
        pool.apply_async(self._read_artifact_source, args=[source, priority],
                         callback=self._add_result)

    for pool in pool_dict.values():
        pool.close()

    at_least_1_runs = True
    # list() so the comparison with the sorted result keys below also works on
    # Python 3, where a range object never equals a list.
    all_keys = list(range(1, len(self.configuration.artifactSources) + 1))
    finished = False
    while at_least_1_runs:
        for i in range(30):
            time.sleep(1)

            if not self.errors.empty():
                for pool in pool_dict.values():
                    logging.debug("Terminating pool %s", str(pool))
                    pool.terminate()
                finished = True
                break

        at_least_1_runs = False
        # NOTE(review): "finished" is first a flag and afterwards the list of
        # finished priorities. Once that list is non-empty this check is
        # skipped and the loop ends; the join() below still waits for the
        # remaining readers. Kept as-is on purpose.
        if not finished:
            self.results_lock.acquire()
            finished = sorted(list(self.results.keys()))
            self.results_lock.release()
            if all_keys != finished:
                logging.debug("Still waiting for priorities %s to finish",
                              str(list(set(all_keys) - set(finished))))
                at_least_1_runs = True

    for pool in pool_dict.values():
        # Pools that were terminated above are not joined.
        if pool._state != multiprocessing.pool.TERMINATE:
            pool.join()

    if not self.errors.empty():
        raise RuntimeError("%i error(s) occured during reading of artifact list." % self.errors.qsize())

    return self._get_artifact_list()
def papply( f, seq, pool_size=cores, callback=None ):
    """
    Apply the given function to each element of the given sequence, optionally invoking the given
    callback with the result of each application. Do so in parallel, using a thread pool no
    larger than the given size.

    :param callable f: the function to be applied

    :param Sequence seq: the input sequence

    :param int pool_size: the desired pool size, if absent the number of CPU cores will be used.
    The actual pool size may be smaller if the input sequence is small. A pool size of 0 will
    make this function emulate the apply() builtin, i.e. f (and the callback, if provided) will
    be invoked serially in the current thread.

    :param callable callback: an optional function to be invoked with the return value of f

    >>> l=[]; papply( lambda a, b: a + b, [], pool_size=0, callback=l.append ); l
    []
    >>> l=[]; papply( lambda a, b: a + b, [ (1, 2) ], pool_size=0, callback=l.append); l
    [3]
    >>> l=[]; papply( lambda a, b: a + b, [ (1, 2), (3, 4) ], pool_size=0, callback=l.append ); l
    [3, 7]
    >>> l=[]; papply( lambda a, b: a + b, [], pool_size=1, callback=l.append ); l
    []
    >>> l=[]; papply( lambda a, b: a + b, [ (1, 2) ], pool_size=1, callback=l.append); l
    [3]
    >>> l=[]; papply( lambda a, b: a + b, [ (1, 2), (3, 4) ], pool_size=1, callback=l.append ); l
    [3, 7]
    >>> l=[]; papply( lambda a, b: a + b, [], pool_size=2, callback=l.append ); l
    []
    >>> l=[]; papply( lambda a, b: a + b, [ (1, 2) ], pool_size=2, callback=l.append); l
    [3]
    >>> l=[]; papply( lambda a, b: a + b, [ (1, 2), (3, 4) ], pool_size=2, callback=l.append ); l
    [3, 7]
    """
    __check_pool_size( pool_size )
    n = len( seq )
    if not n:
        # Nothing to do, and a pool must never be created with size 0
        return
    if pool_size == 0:
        # Serial mode: run everything in the calling thread
        for args in seq:
            result = f( *args )
            if callback is not None:
                callback( result )
        return
    with thread_pool( min( pool_size, n ) ) as pool:
        for args in seq:
            pool.apply_async( f, args, callback=callback )
def func_wrapper(*args, **kwargs):
    """Closure for function.

    Runs the wrapped callable (``item`` from the enclosing scope) on a
    one-thread pool and waits at most ``max_timeout`` for its result.
    """
    pool = multiprocessing.pool.ThreadPool(processes=1)
    try:
        async_result = pool.apply_async(item, args, kwargs)
        # raises a TimeoutError if execution exceeds max_timeout
        return async_result.get(max_timeout)
    finally:
        # Every call creates its own pool; close it so its threads can exit
        # once the call has finished instead of piling up across calls.
        # close() does not block and does not interrupt a call still running.
        pool.close()
def run_trajectory(t, ps, landscape, ptv, num_iterations, num_processors):
    """Run the simulations for every distinct point of trajectory *t* in parallel.

    Points equal to their predecessor (which can happen due to rounding) are
    not simulated again and are returned unchanged. Returns a new
    ``io.Trajectory`` built from the resulting points.
    """
    # Get the points in the trajectory
    points = t.points()

    # Indices of points that differ from the point before them
    unique_indices = []
    for idx, point in enumerate(points):
        if idx == 0 or not point.equals(points[idx - 1]):
            unique_indices.append(idx)

    # Use as many processors as are available, or as are required to allow
    # each point to run concurrently
    pool = mp.Pool(processes=min(num_processors, len(points)))

    pending = []
    for idx in unique_indices:
        # Modify the parameter set to match the current point
        psm = ps.copy()
        psm.modify_for_point(points[idx], ptv)
        psm.convert_to_age_classes()

        # Launch a process to run the simulation(s) for the point
        task_args = [points[idx], psm, landscape, num_iterations, num_processors]
        pending.append(pool.apply_async(run_iterations_for_point, task_args))
    pool.close()
    pool.join()

    # Put each simulated point back at its position in the trajectory
    for idx, async_result in zip(unique_indices, pending):
        points[idx] = async_result.get(None)

    # Return a new trajectory containing the results for each point
    return io.Trajectory(points=points)
def func_wrapper(self, *args, **kwargs):
    """Closure for function.

    Runs ``f(self, *args, **kwargs)`` on a one-thread pool and waits for the
    result. The wait is bounded by the ``timeout_max_timeout`` keyword
    argument when it is given and truthy, otherwise by ``max_timeout``.
    """
    # Take the override out of kwargs BEFORE submitting. The pool keeps a
    # reference to this same dict, so popping afterwards raced with the worker
    # and could leak "timeout_max_timeout" into f.
    timeout = kwargs.pop('timeout_max_timeout', max_timeout) or max_timeout
    pool = multiprocessing.pool.ThreadPool(processes=1)
    try:
        async_result = pool.apply_async(f, (self,) + args, kwargs)
        # raises a TimeoutError if execution exceeds the timeout
        return async_result.get(timeout)
    finally:
        # One pool per call: close it so its threads can exit once f returns.
        pool.close()
def compute_stats(client_factory, db_names=None, table_names=None,
                  continue_on_error=False, parallelism=multiprocessing.cpu_count()):
    """
    Runs COMPUTE STATS over the selected tables. The target tables can be filtered by
    specifying a list of databases and/or table names. If no filters are specified this will
    run COMPUTE STATS on all tables in all databases.

    parallelism controls the size of the thread pool to which compute_stats is sent.

    :param client_factory: callable returning a context manager that yields a client with
        an ``execute(query)`` method whose result exposes ``.data``
    :param db_names: optional iterable of database names to restrict to
    :param table_names: optional iterable of table names to restrict to
    :param continue_on_error: passed through to ``compute_stats_table``
    :raises: whatever a ``compute_stats_table`` call raised (re-raised by ``get()``)
    """
    logging.info("Enumerating databases and tables for compute stats.")

    pool = multiprocessing.pool.ThreadPool(processes=parallelism)
    try:
        futures = []
        with client_factory() as impala_client:
            # "show databases" rows are tab separated; the name is the first field
            all_dbs = set(name.split('\t')[0].lower()
                          for name in impala_client.execute("show databases").data)
            selected_dbs = all_dbs if db_names is None else set(db_names)
            for db in all_dbs.intersection(selected_dbs):
                all_tables = set(
                    t.lower() for t in impala_client.execute("show tables in %s" % db).data)
                selected_tables = all_tables if table_names is None else set(table_names)
                for table in all_tables.intersection(selected_tables):
                    # Submit command to threadpool
                    futures.append(pool.apply_async(
                        compute_stats_table,
                        (client_factory, db, table, continue_on_error,)))
            # Wait for all stats commands to finish
            for f in futures:
                f.get()
    finally:
        # Release the worker threads even when enumeration or a task failed
        pool.close()
def _run_tests(self):
    """Run every suite runner on a thread pool and report overall success.

    Suites are polled every five seconds until all have finished. An
    ``Exception`` is raised as soon as a suite reports ``timed_out()``.
    The pool is always terminated before returning or raising.

    :returns: True if every suite's ``run`` returned a truthy value
    """
    pool = multiprocessing.pool.ThreadPool(processes=self.suite_concurrency)
    pending = []
    for runner in self.suite_runners:
        runner.task = pool.apply_async(runner.run)
        pending.append(runner)

    all_passed = True
    try:
        while pending:
            # Iterate over a copy because finished suites are removed
            for runner in list(pending):
                if runner.timed_out():
                    msg = "Task %s not finished within timeout %s" % (runner.name,
                        runner.suite.timeout_minutes,)
                    logging.error(msg)
                    raise Exception(msg)
                task = runner.task
                if not task.ready():
                    continue
                succeeded = task.get()
                pending.remove(runner)
                if succeeded:
                    logging.info("Suite %s succeeded.", runner.name)
                else:
                    logging.info("Suite %s failed.", runner.name)
                    all_passed = False
            time.sleep(5)
    except KeyboardInterrupt:
        logging.info("\n\nDetected KeyboardInterrupt; shutting down!\n\n")
        raise
    finally:
        pool.terminate()
    return all_passed
def parallel_reduce(func, iterable, processes=4, args=(), kwargs=None):
    """Reduce *iterable* pairwise with ``func`` on a process pool.

    Each round pairs the remaining values up (taken from the end of the list)
    and evaluates ``func(a, b)`` for every pair in parallel. Rounds repeat
    until at most one value is left.

    :param func: picklable callable taking two positional arguments
    :param iterable: the values to reduce
    :param processes: number of worker processes
    :param args: accepted for compatibility; not used
    :param kwargs: accepted for compatibility; not used
    :returns: a list holding the single reduced value, or the input as a list
        when it has fewer than two elements
    """
    comp_stack = list(iterable)
    if len(comp_stack) < 2:
        return comp_stack

    pool = multiprocessing.pool.Pool(processes)
    try:
        while len(comp_stack) > 1:
            pair_list = []
            while len(comp_stack) > 1:
                pair_list.append((comp_stack.pop(), comp_stack.pop()))
            # Submit in the same order as before (last pair built goes first).
            results = [pool.apply_async(func, pair) for pair in reversed(pair_list)]
            # comp_stack still holds the unpaired element of an odd-length
            # round. It used to be overwritten and lost; carry it over.
            # get() blocks, so no busy-wait polling is needed.
            comp_stack = comp_stack + [result.get() for result in results]
    finally:
        pool.close()
        pool.join()
    return comp_stack
def from_carrays(path, format_categories='bcolz', format_codes='bcolz', format_values='bcolz', parallel=True):
    """Build a ``pandas.DataFrame`` from the entries found directly under *path*.

    Every entry is loaded with ``_from_carray``, which returns ``(meta, series)``;
    the series becomes the column named ``meta['name']``. With ``parallel=True``
    the entries are loaded on a thread pool, otherwise one after another.
    """
    assert os.path.exists(path), 'No path {}'.format(path)
    column_paths = glob.glob(os.path.join(path, '*'))
    columns = dict()

    if parallel:
        pool = multiprocessing.pool.ThreadPool()
        pending = []
        for column_path in column_paths:
            pending.append(pool.apply_async(
                _from_carray,
                args=(column_path,),
                kwds={'format_categories': format_categories,
                      'format_codes': format_codes,
                      'format_values': format_values}))
        pool.close()
        pool.join()
        for async_result in pending:
            meta, s = async_result.get()
            columns[meta['name']] = s
    else:
        for column_path in column_paths:
            meta, s = _from_carray(column_path,
                                   format_categories=format_categories,
                                   format_codes=format_codes,
                                   format_values=format_values)
            columns[meta['name']] = s

    # this is slow when we have non categoricals as series for some reason
    with log.timedlogger('constructing dataframe from %s column dict' % len(columns)):
        frame = pandas.DataFrame(columns)  # TODO: fast DataFrame constructor
    return frame
def main():
    """Download one package given on the command line, or everything pinned in
    ``REQUIREMENTS_FILES``.

    With command line arguments, exactly ``<pkg_name> <pkg_version>`` is
    expected and that package is downloaded synchronously. Without arguments,
    every ``name==version`` line of every requirements file is downloaded on a
    thread pool of at most four threads.
    """
    if len(sys.argv) > 1:
        _, pkg_name, pkg_version = sys.argv
        download_package(pkg_name, pkg_version)
        return

    pool = multiprocessing.pool.ThreadPool(processes=min(multiprocessing.cpu_count(), 4))
    try:
        results = []
        for requirements_file in REQUIREMENTS_FILES:
            # If the package name and version are not specified in the command line
            # arguments, download the packages that are in requirements.txt.
            # requirements.txt follows the standard pip grammar.
            with open(requirements_file) as requirements:  # closed deterministically
                for line in requirements:
                    # A hash symbol ("#") represents a comment that should be ignored.
                    line = line.split("#")[0]
                    # A semi colon (";") specifies some additional condition for when the
                    # package should be installed (for example a specific OS). We can ignore
                    # this and download the package anyways because the installation script
                    # (bootstrap_virtualenv.py) can take it into account.
                    requirement = line.split(";")[0].strip()
                    if not requirement:
                        continue
                    pkg_name, pkg_version = requirement.split('==')
                    results.append(pool.apply_async(
                        download_package, args=[pkg_name.strip(), pkg_version.strip()]))

        # get() re-raises any exception hit by a download
        for x in results:
            x.get()
    finally:
        pool.close()
def _queue_job(self, pool, key, data_file, data_file_size):
    """Submit one ``_fetch_and_process_chunk`` call for *data_file* to *pool*.

    On completion ``self.job_completed(key)`` is invoked; on failure
    ``self.job_failed(key, exception)`` is invoked with the raised exception.
    """
    chunk_kwargs = {
        "app_config": self.config,
        "debug": self.debug,
        "data_file": data_file,
        "data_file_size": data_file_size,
        "download_progress_per_file": self.download_progress_per_file,
        "site": self.site,
        "pgdata": self.pgdata,
        "tablespaces": self.tablespaces,
    }

    def on_success(*args):
        return self.job_completed(key)

    def on_failure(exception):
        return self.job_failed(key, exception)

    pool.apply_async(_fetch_and_process_chunk, [], chunk_kwargs, on_success, on_failure)
def test_multi_own_ca(self):
    """Five concurrent ``issue_n_certs("ownca", range(5))`` calls must not return duplicates."""
    pool = multiprocessing.pool.ThreadPool(processes=5)
    pending = [pool.apply_async(issue_n_certs, ("ownca", range(5))) for _ in range(5)]

    issued = []
    for job in pending:
        issued.extend(job.get())

    # The combined values equal their de-duplicated form only if all are unique
    nt.assert_equal(sorted(issued), sorted(list(set(issued))))
def update(opts):
    """Run the menu updaters enabled in the options.

    ``opts`` is first passed through ``options_from_config``. A single enabled
    updater is called directly as ``action(opts)``; several are run in a
    ``NoDaemonPool`` as ``action(opts, True)``. Nothing happens when no
    updater is enabled.
    """
    opts = options_from_config(opts)

    candidates = (
        (opts.with_applications, update_applications),
        (opts.with_bookmarks, update_bookmarks),
        (opts.with_recent_files, update_recent_files),
        (opts.with_devices, update_devices),
    )
    actions = [action for enabled, action in candidates if enabled]

    if not actions:
        # Previously this fell through to NoDaemonPool(processes=0).
        # NOTE(review): assumes NoDaemonPool rejects a size of 0 the way
        # multiprocessing pools do — confirm.
        return

    if len(actions) == 1:
        actions[0](opts)
        return

    pool = NoDaemonPool(processes=len(actions))
    for action in actions:
        pool.apply_async(action, (opts, True))
    pool.close()
    pool.join()
def prepopulate(gitURLs):
    """Run ``_fetchSubthread`` concurrently for every URL not already in ``_cache``.

    A ``repomirror.RepoMirror`` is created per uncached URL and handed to
    ``_fetchSubthread`` on a thread pool of ``_CONCURRENCY`` threads. The call
    blocks until all fetches finish and re-raises the first failure.

    :param gitURLs: iterable of git repository URLs
    """
    pool = multiprocessing.pool.ThreadPool(_CONCURRENCY)
    try:
        futures = []
        for url in gitURLs:
            if url in _cache:
                continue
            mirror = repomirror.RepoMirror(url)
            futures.append(pool.apply_async(_fetchSubthread, args=(mirror,)))
        for future in futures:
            future.get()
    finally:
        # The pool is local to this call; close it so its threads can exit.
        pool.close()
def test_multithread(self):
    """Five threads issuing from one shared ``EzbakeCA`` must not return duplicates."""
    ca = ezbakeca.EzbakeCA("threadingCA")
    pool = multiprocessing.pool.ThreadPool(processes=5)

    jobs = []
    for _ in range(5):
        job = pool.apply_async(issue_n_certs, (ca, range(5)))
        jobs.append(job)

    collected = []
    for job in jobs:
        collected.extend(job.get())

    unique_sorted = sorted(list(set(collected)))
    nt.assert_equal(sorted(collected), unique_sorted)
def test_recursive_parallel_reduce(workers = 5):
    """Submit ``parallel_reduce(sum, r)`` for three ranges to a ``RecursivePool``
    and print the collected results.

    ``workers`` is accepted but not used.
    """
    pool = RecursivePool()
    ranges = [range(1, 5), range(2, 9), range(3, 7)]
    print(ranges)

    results = []
    collect = results.append
    for myrange in ranges:
        pool.apply_async(parallel_reduce, [sum, myrange], callback=collect)
    pool.close()
    pool.join()
    print(results)

#if __name__ == '__main__':
#    test_recursive_parallel_reduce()
def test_multithread(self):
    """Issue concurrently from one saved ``EzbakeCA`` and check there are no duplicates.

    The CA is saved before the threads start and again after they finish.
    """
    ca = ezbakeca.EzbakeCA("threadingCA")
    ca.save()

    pool = multiprocessing.pool.ThreadPool(processes=5)
    async_results = [
        pool.apply_async(issue_n_certs, (ca, range(5)))
        for _ in range(5)
    ]

    vals = []
    for async_result in async_results:
        vals.extend(async_result.get())

    ca.save()  # save since the threads might still be writing the serial file
    nt.assert_equal(sorted(vals), sorted(list(set(vals))))
def do_get_sample_prompts_list(self):
    """Return the current prompt followed by all readable sample prompts.

    ``self.do_get_current_prompt`` runs concurrently with reading every
    ``sample_prompts/*.fish`` file through ``self.read_one_sample_prompt``.
    Falsy read results are dropped.

    :returns: list whose first element is the current prompt result
    """
    pool = multiprocessing.pool.ThreadPool(processes=8)
    try:
        # Kick off the "Current" meta-sample
        current_metasample_async = pool.apply_async(self.do_get_current_prompt)

        # Read all of the prompts in sample_prompts
        paths = glob.iglob('sample_prompts/*.fish')
        sample_results = pool.map(self.read_one_sample_prompt, paths, 1)

        # Finish up
        result = [current_metasample_async.get()]
        result.extend(r for r in sample_results if r)
        return result
    finally:
        # The pool used to be abandoned with its eight threads still alive.
        # All results have been collected (or an error is propagating), so
        # terminating is safe.
        pool.terminate()
def run_parrallel_iterations(ps, landscape, config):
    """Run ``run_iteration`` repeatedly on a process pool and post-process each result.

    Finished iterations are picked up every 15 seconds, numbered in the order
    they are processed and handed to ``process_iteration`` with ``config.ext``
    set to ``"/iteration <n>"``. When ``config.num_iterations > 1`` the
    per-time-step populations are summed, divided by ``config.num_iterations``
    and processed once more with ``config.ext`` set to ``"/means"``.

    NOTE(review): ``num_processors`` and ``num_iterations`` are neither
    parameters nor locals here — presumably module-level globals; confirm
    (``config.num_iterations`` is what the rest of this function reads).
    """
    iteration = 0
    if config.num_iterations > 1:
        # One accumulator per time step, shaped like the landscape
        means = [np.zeros(landscape.shape) for _ in xrange(ps.max_time_steps)]

    #run_iteration(ps, landscape)
    #Perform the iterations using a process pool for concurrency
    pool = mp.Pool(num_processors)
    print "running...", num_iterations, "iterations for", ps.max_time_steps, "timesteps on", num_processors, "processors"
    results = [ pool.apply_async(run_iteration, args=[ps, landscape]) for _ in xrange(num_iterations) ]
    pool.close()

    #Process iterations as they complete
    while len(results) > 0:
        completed = [i for i, r in enumerate(results) if r.ready()]
        for i in reversed( completed ): #reversed so that indices aren't invalidated as we pop
            print "processing iteration " + str(iteration + 1)
            #Get the result from the list and save the iteration to file
            Ns = results.pop(i).get(None)
            config.ext = "/iteration " + str(iteration + 1)
            process_iteration(Ns, ps, landscape, config)
            iteration += 1
            if config.num_iterations > 1:
                #Add the population for each time step in the iteration to the total
                for t, N in enumerate(Ns):
                    means[t] += N
        # Poll interval between checks for finished iterations
        time.sleep(15)

    # NOTE(review): the pool was already closed right after submission above;
    # this second close() looks redundant.
    pool.close()
    pool.join()
    #run_iteration(ps, landscape)

    if config.num_iterations > 1:
        # Turn the per-time-step sums into means, in place
        for N in means:
            N /= config.num_iterations
        config.ext = "/means"
        # NOTE(review): called through ``io.`` here but unqualified inside the
        # loop above — confirm both names refer to the same function.
        io.process_iteration(means, ps, landscape, config)
def run_parallel(*args, **kwargs):  # pylint: disable=missing-docstring
    # Calls the method called ``name`` on every node in ``self.nodes``
    # concurrently (``self`` and ``name`` come from the enclosing scope) and
    # returns {node: result}. An optional ``callback_`` keyword argument is
    # removed from kwargs and, wrapped by ``_insert_arg0(callback, node)``,
    # used as the per-node completion callback.
    callback = kwargs.pop('callback_', None)

    pool = multiprocessing.pool.ThreadPool(len(self.nodes))
    try:
        async_res = dict()
        for node in self.nodes:
            node_callback = _insert_arg0(callback, node) if callback else None
            func = getattr(node, name)
            async_res[node] = pool.apply_async(func, args, kwargs, node_callback)

        # get() re-raises the first exception hit by any node.
        # .items() behaves the same on Python 2 and 3 (iteritems was 2-only).
        return {n: r.get() for n, r in async_res.items()}
    finally:
        # A new pool is built on every call; close it so its threads can exit.
        pool.close()
def run(jobs, threads=None):
    """Run every job concurrently through ``_safeRun`` and wait for all of them.

    Each job is a dict with a required ``'callback'`` entry, an optional
    ``'args'`` entry (positional arguments, default ``()``) and any other
    entries, which are passed on as keyword arguments.

    :param jobs: sequence of job dicts; an empty sequence is a no-op
    :param threads: pool size; defaults to one thread per job
    :raises: whatever a job raised (re-raised by ``get()``)
    """
    if not jobs:
        # With threads=None this used to request a pool of size 0, which
        # multiprocessing rejects with ValueError.
        return
    if threads is None:
        threads = len(jobs)

    pool = multiprocessing.pool.ThreadPool(processes=threads)
    try:
        futures = []
        for job in jobs:
            kwargs = dict(job)
            callback = kwargs.pop('callback')
            args = kwargs.pop('args', ())
            futures.append(pool.apply_async(_safeRun, args=(callback, args, kwargs)))

        # NOTE(review): the explicit (huge) timeout is presumably there to
        # keep the wait interruptible on Python 2 — confirm before removing.
        for future in futures:
            future.wait(timeout=2 ** 31)
        for future in futures:
            future.get()
    finally:
        pool.close()
def main():
    """Download every iCloud photo into the folder given on the command line.

    File names occurring more than once get a ``-<n>`` suffix before the
    extension for all but the last occurrence. A photo is skipped when a file
    of the same size already exists at its target path. Downloads run on a
    pool of ten threads.
    """
    parser = argparse.ArgumentParser(description='Download All Photos from iCloud')
    parser.add_argument('-apple_id', required=True, help="Your AppleID (password must be in KeyChain")
    parser.add_argument('-password', required=True)  # TODO switch to using keyring
    parser.add_argument('-folder', required=True, help='Path to Download Photos To')
    app_args = parser.parse_args()
    app_args.folder = os.path.expanduser(app_args.folder)

    with multiprocessing.pool.ThreadPool(10) as pool:
        api = PyiCloudService(app_args.apple_id, app_args.password)
        photos = list(api.photos.all)

        # Count how often each file name occurs and keep only the repeated ones
        occurrences = dict()
        for photo in photos:
            occurrences[photo.filename] = occurrences.get(photo.filename, 0) + 1
        file_dups = {fname: count for fname, count in occurrences.items() if count > 1}

        for photo in photos:
            fname = photo.filename
            if fname in file_dups:
                assert file_dups[fname] > 0
                remaining = file_dups[fname] - 1
                file_dups[fname] = remaining
                if remaining > 0:
                    name, ext = fname.rsplit('.', 1)
                    fname = '{}-{}.{}'.format(name, remaining, ext)

            photo_path = os.path.join(app_args.folder, fname)
            already_there = os.path.exists(photo_path) and os.path.getsize(photo_path) == photo.size
            if not already_there:
                pool.apply_async(download_photo, [photo, photo_path])

        pool.close()
        pool.join()
def main():
    """Check every host of the address ranges listed in the input file.

    ``sys.argv[1]`` is the input file, ``sys.argv[2]`` is passed to every
    ``check_host`` call. The first input line holds comma separated
    ``key:value`` settings (``host`` and ``port`` are required, ``sni`` and
    ``process_num`` are optional); every further line is a single address or a
    ``start-end`` range. One ``check_host`` task is submitted per address.
    """
    readfilepath, writefilepath = sys.argv[1], sys.argv[2]

    with open(readfilepath) as readfile:
        header = readfile.readline().replace(" ", "").replace("\n", "").split(",")
        condition = {}
        for item in header:
            fields = item.split(":")
            condition[fields[0]] = fields[1]
        target = readfile.readline()

        # Normalise the settings: sni defaults to on, process_num to 1
        condition["sni"] = condition.get("sni", "on")
        condition["sni"] = condition["sni"].lower() in ["on", "true", "1"]
        condition["host"] = condition["host"].encode()
        condition["port"] = int(condition["port"])
        condition["process_num"] = int(condition.get("process_num", 1))
        print(condition)

        with closing(Pool(condition["process_num"])) as pool:
            while target:
                bounds = target.replace(" ", "").replace("\n", "").split("-")
                startip = ipaddress.ip_address(bounds[0])
                # A line without "-" describes a single address
                finiship = ipaddress.ip_address(bounds[1] if len(bounds) > 1 else bounds[0])

                # Walk from startip to finiship, both inclusive
                currentip = startip - 1
                while currentip < finiship:
                    currentip = currentip + 1
                    pool.apply_async(check_host, [str(currentip), condition, writefilepath])

                target = readfile.readline()
            pool.close()
            pool.join()
elif cmd[0] == 'mul': regs[cmd[1]] *= val(cmd[2]) elif cmd[0] == 'mod': regs[cmd[1]] %= val(cmd[2]) elif cmd[0] == 'rcv': if inqueue: regs[cmd[1]] = inqueue.get() elif regs[cmd[1]] != 0: return played elif cmd[0] == 'jgz': if val(cmd[1]) > 0: pc += val(cmd[2]) continue pc += 1 return count print('PART 1:', run(0, None, None)) pool = multiprocessing.pool.ThreadPool(processes=2) q1 = multiprocessing.Queue() q2 = multiprocessing.Queue() res1 = pool.apply_async(run, (0, q1, q2)) res2 = pool.apply_async(run, (1, q2, q1)) res1.get() print('PART 2:', res2.get())
mongo = MongoDBConnection() with mongo: db = mongo.connection.HPNorton file_list = (product_file, customer_file) logging.debug('Successfully obtained file list') products = db['products'] customers = db['customers'] database_list = (products, customers) logging.debug('Got database list, going through files now') final_list = [] MP_list = [] pool = multiprocessing.pool.ThreadPool(processes=2) for filename, database in zip(file_list, database_list): logging.debug('Attempting to open %s/%s', directory_name, filename) MP = pool.apply_async(insert_data, (directory_name, filename, database)) MP_list.append(MP) list1 = MP_list[0] list2 = MP_list[1] final_list = [list1.get(), list2.get()] print(final_list) return final_list def import_data(directory_name, product_file, customer_file): """ Takes a directory name three csv files on input (product data, customer data, rentals data) and populates new mongo DB and returns two tuples (record count of number or products customers, rentals added) (second with count of number of errors occured)
def __init__(self, directory, image_data_generator,
             target_size=(256, 256), class_mode='binary',
             tags=(('satellite', 'jpg'), ('roadmap', 'png')),
             batch_size=32, shuffle=True, seed=None,
             data_format=None, save_to_dir=None,
             save_prefix='', save_format='png',
             subset=None, interpolation='nearest'):
    """Index the images below *directory*, one class per sub-directory.

    :param directory: root directory; every sub-directory is one class
    :param image_data_generator: generator object; its ``_validation_split``
        is read when *subset* is given
    :param target_size: image size; combined with 6 channels into ``image_shape``
    :param class_mode: ``'binary'`` or ``None``
    :param tags: tuple of two 2-tuples; only ``tags[0][0]`` is used here to
        count and list files
    :param subset: ``'training'``, ``'validation'`` or ``None``
    :raises ValueError: on invalid ``tags``, ``class_mode`` or ``subset``

    The remaining arguments are stored on the instance or passed to the
    parent initialiser unchanged.
    """
    if data_format is None:
        data_format = K.image_data_format()
    self.directory = directory
    self.image_data_generator = image_data_generator
    self.target_size = tuple(target_size)
    if len(tags) != 2:
        raise ValueError('Invalid tags:', tags,
                         '; expected tuple of two tuples.')
    if len(tags[0]) != 2 or len(tags[1]) != 2:
        raise ValueError('Invalid tags:', tags,
                         '; expected tuples of two strings.')
    self.tags = tags
    self.data_format = data_format
    # NOTE(review): 6 channels, presumably two 3-channel images stacked — confirm
    if self.data_format == 'channels_last':
        self.image_shape = self.target_size + (6, )
    else:
        self.image_shape = (6, ) + self.target_size
    if class_mode not in {'binary', None}:
        raise ValueError('Invalid class_mode:', class_mode,
                         '; expected one of "binary" or None.')
    self.class_mode = class_mode
    self.save_to_dir = save_to_dir
    self.save_prefix = save_prefix
    self.save_format = save_format
    self.interpolation = interpolation

    if subset is not None:
        validation_split = self.image_data_generator._validation_split
        if subset == 'validation':
            split = (0, validation_split)
        elif subset == 'training':
            split = (validation_split, 1)
        else:
            raise ValueError('Invalid subset name: ', subset,
                             '; expected "training" or "validation"')
    else:
        split = None
    self.subset = subset

    # First, count the number of samples and classes.
    self.samples = 0

    classes = []
    for subdir in sorted(os.listdir(directory)):
        if os.path.isdir(os.path.join(directory, subdir)):
            classes.append(subdir)
    self.classes = classes
    self.num_classes = len(classes)
    self.class_indices = dict(zip(classes, range(len(classes))))

    pool = multiprocessing.pool.ThreadPool()
    try:
        function_partial = partial(_count_valid_files_in_directory,
                                   tag=self.tags[0][0],
                                   split=split)
        self.samples = sum(
            pool.map(function_partial,
                     (os.path.join(directory, subdir) for subdir in classes)))

        print('Found %d images belonging to %d classes.' %
              (self.samples, self.num_classes))

        # Second, build an index of the images
        # in the different class subfolders.
        results = []
        self.filenames = []
        self.classes = np.zeros((self.samples, ), dtype='int32')
        i = 0
        for dirpath in (os.path.join(directory, subdir) for subdir in classes):
            results.append(
                pool.apply_async(
                    _list_valid_filenames_in_directory,
                    (dirpath, split, self.class_indices, self.tags[0][0])))
        for res in results:
            # "classes" is rebound here to the class indices of one directory
            classes, filenames = res.get()
            self.classes[i:i + len(classes)] = classes
            self.filenames += filenames
            i += len(classes)
    finally:
        # Release the pool even when counting or listing raised
        pool.close()
        pool.join()
    super(DirectoryIterator, self).__init__(self.samples, batch_size, shuffle, seed)
def run_function_different_arguments_parallel(cls, function, arguments, all_success=False, signal=None, parallel=True, threads=0, *args, **kwargs): """ Call functions in parallel :param function: f(argument, **kwargs) :param arguments: {i: argument} :param all_success: (boolean) the function will raise an exception if one of the runs fail and all_success is True :param signal: (function) calls this function after generating the jobs. It's used to test KeyboardInterrupt, and the signal is a mock of KeyboardInterrupt. :param parallel: (boolean) The code is run in parallel only if it's True. :param threads: (int) Uses threads instead of processes if threads > 0 :param args: additional arguments of function :param kwargs: additional arguments of function :return: {int: output of f(arguments[i])} """ # Maybe later we enable this feature. #thread = False jobs = {} if not parallel: return cls.run_function_different_arguments_sequentially(function, arguments, *args, **kwargs) n_jobs = min(len(arguments), mp.cpu_count()) if threads > 0: pool = ThreadPool(threads) else: pool = mp.Pool(processes=n_jobs) try: for key, argument in arguments.iteritems(): job = pool.apply_async(function, args=(argument, ) + args, kwds=kwargs) jobs[key] = job pool.close() pool.join() if signal is not None: signal(1) except KeyboardInterrupt: logger.info("Ctrl+c received, terminating and joining pool.") pool.terminate() pool.join() return -1 results = {} for key in arguments.keys(): try: results[key] = jobs[key].get() except Exception as e: if all_success: raise e else: logger.info("job failed") logger.info(key) logger.info(argument) logger.info(args) logger.info(kwargs) return results
def __init__(self, directory, classes=None, number_subsequences=32, dim=(32, 32, 32), n_channels=6,
             n_classes=10, shuffle=True, n_samples=None, seed=None, faster=True,
             online_training=False, repeat=True, use_spacer=False, randomrepeat=False,
             sequence_length=50, full_seq_embedding=False, final_set=True,
             include_raptorx_iupred=False, include_dict_scores=False, non_binary=False,
             **kwargs):
    """Store the settings and index the files below *directory*, one class per sub-directory.

    :param directory: root directory containing one sub-directory per class
    :param classes: class (sub-directory) names; discovered from *directory*
        when empty or None
    :param number_subsequences: 1 disables ``shrink_timesteps``
    :param n_classes: not used; ``self.n_classes`` is derived from the classes
    :param n_samples: samples to pick per class; defaults to the size of the
        smallest class when falsy
    :param faster: ``True`` means 16, a positive int is used as is, anything
        else means 1 (note that ``1 == True`` and therefore also yields 16)
    :param full_seq_embedding: selects ``'pkl'`` files instead of ``'csv'``
    :param kwargs: only ``maxLen`` is read

    The remaining arguments are stored on the instance unchanged.
    """
    self.directory = directory
    self.classes = classes
    self.dim = dim
    self.labels = None
    self.list_IDs = None
    self.n_channels = n_channels
    self.shuffle = shuffle
    self.seed = seed
    self.online_training = online_training
    self.repeat = repeat
    self.use_spacer = use_spacer
    self.randomrepeat = randomrepeat
    self.maxLen = kwargs.get("maxLen", None)
    self.sequence_length = sequence_length
    self.full_seq_embedding = full_seq_embedding
    self.final_set = final_set
    self.include_raptorx_iupred = include_raptorx_iupred
    self.include_dict_scores = include_dict_scores
    self.non_binary = non_binary

    if full_seq_embedding:
        file_format = 'pkl'
    else:
        file_format = 'csv'

    if number_subsequences == 1:
        self.shrink_timesteps = False
    else:
        self.shrink_timesteps = True

    self.number_subsequences = number_subsequences

    if faster == True:
        self.faster = 16
    elif type(faster) == int and faster > 0:
        self.faster = faster
    else:
        self.faster = 1

    self.number_samples_per_batch = self.faster

    self.number_samples_per_class_to_pick = n_samples

    if not classes:
        classes = []
        for subdir in sorted(os.listdir(directory)):
            if os.path.isdir(os.path.join(directory, subdir)):
                classes.append(subdir)
        self.classes = classes

    self.n_classes = len(classes)
    self.class_indices = dict(zip(classes, range(len(classes))))
    print(self.class_indices)

    # want a dict which contains dirs and number usable files
    pool = multiprocessing.pool.ThreadPool()
    try:
        function_partial = partial(_count_valid_files_in_directory,
                                   white_list_formats={file_format},
                                   follow_links=None,
                                   split=None)
        self.samples = pool.map(function_partial, (os.path.join(directory, subdir) for subdir in classes))
        self.samples = dict(zip(classes, self.samples))

        results = []

        for dirpath in (os.path.join(directory, subdir) for subdir in classes):
            results.append(
                pool.apply_async(
                    utils._list_valid_filenames_in_directory,
                    (dirpath, {file_format}, None, self.class_indices, None)))

        self.filename_dict = {}
        for res in results:
            # "classes" is rebound here to the per-file class values of one directory
            classes, filenames = res.get()
            for index, class_i in enumerate(classes):
                self.filename_dict.update(
                    {f"{class_i}_{index}": filenames[index]})
    finally:
        # Release the pool even when counting or listing raised
        pool.close()
        pool.join()

    if not n_samples:
        self.number_samples_per_class_to_pick = min(self.samples.values())

    self.elmo_embedder = Elmo_embedder()

    self.on_epoch_end()
def __init__(self,
             directory,
             image_data_generator,
             target_size=(256, 256),
             color_mode='rgb',
             classes=None,
             class_mode='categorical',
             batch_size=32,
             shuffle=True,
             seed=None,
             data_format=None,
             vector_length=512,
             dimension=256,
             save_to_dir=None,
             save_prefix='',
             save_format='png',
             follow_links=False):
    """Index the images below ``directory`` and initialise the iterator.

    Each sub-directory of ``directory`` is one class unless ``classes``
    lists the sub-directories explicitly.  Files with a png/jpg/jpeg/bmp
    extension are counted and listed per class using a thread pool.

    :param directory: root folder with one sub-directory per class.
    :param image_data_generator: stored on the instance unchanged.
    :param target_size: image size; combined with the channel count into
        ``self.image_shape``.
    :param color_mode: ``'rgb'`` (3 channels) or ``'grayscale'`` (1).
    :param classes: optional list of class sub-directory names.
    :param class_mode: one of the modes accepted by the check below.
    :param data_format: ``'channels_last'`` puts the channel axis last, any
        other value puts it first; defaults to ``K.image_data_format()``.
    :param follow_links: forwarded to the file counting/listing helpers.
    :raises ValueError: for an unsupported ``color_mode`` or ``class_mode``.

    ``batch_size``, ``shuffle`` and ``seed`` are forwarded to the parent
    initialiser; the remaining parameters are stored on the instance.
    """
    if data_format is None:
        data_format = K.image_data_format()
    self.directory = directory
    self.image_data_generator = image_data_generator
    self.target_size = tuple(target_size)
    self.vector_length = vector_length
    if color_mode not in {'rgb', 'grayscale'}:
        raise ValueError('Invalid color mode:', color_mode,
                         '; expected "rgb" or "grayscale".')
    self.color_mode = color_mode
    self.data_format = data_format

    channels = (3,) if self.color_mode == 'rgb' else (1,)
    if self.data_format == 'channels_last':
        self.image_shape = self.target_size + channels
    else:
        self.image_shape = channels + self.target_size

    self.classes = classes
    self.dimension = dimension
    if class_mode not in {'categorical', 'binary', 'sparse', 'input',
                          'input_g_c', 'colorize', 'kl_divergence', None}:
        raise ValueError('Invalid class_mode:', class_mode,
                         '; expected one of "categorical", '
                         '"binary", "sparse", "input"'
                         ' or None.')
    self.class_mode = class_mode
    self.save_to_dir = save_to_dir
    self.save_prefix = save_prefix
    self.save_format = save_format

    white_list_formats = {'png', 'jpg', 'jpeg', 'bmp'}

    # first, count the number of samples and classes
    self.samples = 0

    if not classes:
        classes = [
            subdir for subdir in sorted(os.listdir(directory))
            if os.path.isdir(os.path.join(directory, subdir))
        ]
    self.num_class = len(classes)
    self.class_indices = dict(zip(classes, range(len(classes))))

    class_dirs = [os.path.join(directory, subdir) for subdir in classes]

    pool = multiprocessing.pool.ThreadPool()
    try:
        function_partial = partial(_count_valid_files_in_directory,
                                   white_list_formats=white_list_formats,
                                   follow_links=follow_links)
        self.samples = sum(pool.map(function_partial, class_dirs))

        print('Found %d images belonging to %d classes.' %
              (self.samples, self.num_class))

        # second, build an index of the images in the different class
        # subfolders
        results = [
            pool.apply_async(_list_valid_filenames_in_directory,
                             (dirpath, white_list_formats,
                              self.class_indices, follow_links))
            for dirpath in class_dirs
        ]
        self.filenames = []
        self.classes = np.zeros((self.samples,), dtype='int32')
        i = 0
        for res in results:
            class_ids, filenames = res.get()
            self.classes[i:i + len(class_ids)] = class_ids
            self.filenames += filenames
            i += len(class_ids)
    finally:
        # Always release the worker threads, even when listing fails.
        pool.close()
        pool.join()

    super(DirectoryIterator, self).__init__(self.samples, batch_size,
                                            shuffle, seed)
def main():
    """Back-test the configured strategies and export their averaged stats.

    Loads the back-test workbook, runs ``StrategiesRun`` either once in
    this process (``strategy_runs == 1``) or ``strategy_runs`` times in a
    process pool, averages the per-strategy statistics with
    ``average_Strategy_Stats`` and writes one column per strategy to
    ``./Data/Backtest_data/Backtest_simulations.xlsx``.
    """
    # SYSTEM VARIABLES
    Strategies_to_test = [100]
    strategy_runs = 100
    randomize = True
    verbose = True
    Market_history_file = './Data/Backtest_data/BacktestData_2020-21.xlsx'
    comission_pcnt = 0.02  # 2% Betfair comission
    initial_balance = 1100
    min_bet = 0  # Minimum wager (e.g. Betfair exchange)
    max_bet = 500  # Maximum bet the market can take? (much higher for Sportsbook)
    f = 0.1  # percentile of balance to bet
    Fixed_bet_amount = round(initial_balance * 0.025)  # for all fixed bet strategies
    saving = 0.0  # save percentagex100 of balance above initial balance

    # LOAD BACKTEST EXCEL DATA
    no_games, results, FULL_our_probs, FULL_our_prediction, FULL_market_odds,\
        FULL_market_prediction, CARMELO, COVERS, ODDSHARK, H2H, ODDSHARK_LastN_Away, ODDSHARK_LastN_Home = LoadBackTestData(Market_history_file)

    # STRATEGIES EVALUATION
    # variables for each strategy
    num_of_strategies = len(Strategies_to_test)
    StratStats = [[] for _ in range(num_of_strategies)]
    running_stats = [[] for _ in range(num_of_strategies)]

    # run strategies multiple times
    if randomize is False:
        strategy_runs = 1

    if strategy_runs > 1 or num_of_strategies > 1:
        verbose = False

    if verbose:
        plt.figure()
        ax = plt.axes()
        ax2 = ax.twinx()  # instantiate a second axes that shares the same x-axis
        ax.set_xlabel('Bets')
        ax.set_ylabel('Balance', color='blue')
        ax.tick_params(axis='y', labelcolor='blue')
    else:
        ax = None
        ax2 = None

    # Same positional arguments for every StrategiesRun invocation.
    run_args = (Strategies_to_test, min_bet, max_bet, results, f,
                Fixed_bet_amount, FULL_our_probs, FULL_our_prediction,
                FULL_market_odds, FULL_market_prediction, initial_balance,
                no_games, randomize, verbose, comission_pcnt, ax, ax2, saving)

    # Single processing. Only for manual entries or for single runs
    if strategy_runs == 1:
        running_stats = StrategiesRun(*run_args)
    else:
        # Multiprocessing
        cpus = 12
        pool = multiprocessing.Pool(processes=cpus)
        try:
            # Named differently from `results` (the loaded game outcomes)
            # so the input data is never shadowed by the async handles.
            async_results = [
                pool.apply_async(StrategiesRun, args=run_args)
                for _ in range(strategy_runs)
            ]
            pool.close()
            pool.join()

            # Gather the results
            for p in async_results:
                run_output = p.get()
                for strats in range(num_of_strategies):
                    running_stats[strats].append(run_output[strats][0])
        finally:
            # No-op after a clean close/join; reclaims workers on failure.
            pool.terminate()

    # average stats over runs
    for strats in range(num_of_strategies):
        StratStats[strats] = average_Strategy_Stats(running_stats[strats])

    # SAVE TO EXCEL
    workbook = openpyxl.Workbook()
    worksheet = workbook.worksheets[0]
    fields = dir(StratStats[0])
    for strats in range(num_of_strategies):
        field_count = 1
        # Strategy name (Header). Cell indices start from 1
        worksheet.cell(1, strats + 2).value = str(
            StratStats[strats].StrategyName)
        # The first entry of dir() is skipped, exactly as before.
        for field in fields[1:]:
            if "__" not in field:  # skip over internal fields of the struct
                # Field name (Header). Cell indices start from 1
                worksheet.cell(field_count + 1, 1).value = str(field)
                # Field data. getattr replaces the former exec() of a
                # dynamically built assignment statement.
                worksheet.cell(field_count + 1, strats + 2).value = getattr(
                    StratStats[strats], field)
                field_count = field_count + 1
    workbook.save("./Data/Backtest_data/Backtest_simulations.xlsx")
def wrapper(*args, **kw):
    """Run ``func(*args, **kw)`` in a worker thread and wait for its result.

    Waits at most ``seconds`` for the call to finish and returns its
    value.  ``AsyncResult.get`` raises ``multiprocessing.TimeoutError``
    when the wait expires and re-raises any exception raised by ``func``.
    ``func`` and ``seconds`` are taken from the enclosing scope.
    """
    pool = multiprocessing.pool.ThreadPool(processes=1)
    try:
        async_result = pool.apply_async(func, args, kw)
        return async_result.get(seconds)
    finally:
        # Previously a new pool was leaked on every call.  close() does not
        # block: the worker thread exits once its current task returns.
        pool.close()
def __init__(self,
             directory,
             sound_data_generator,
             target_size=(256, 256),
             classes=None,
             class_mode='categorical',
             batch_size=32,
             shuffle=True,
             seed=None,
             follow_links=False,
             subset=None,
             interpolation='nearest',
             dtype='float32'):
    """Index the wav files below ``directory`` and initialise the iterator.

    Each sub-directory of ``directory`` is one class unless ``classes``
    lists the sub-directories explicitly.  File names are collected per
    class using a thread pool.

    :param directory: root folder with one sub-directory per class.
    :param sound_data_generator: forwarded to ``set_processing_attrs``.
    :param target_size: forwarded to ``set_processing_attrs``.
    :param classes: optional list of class sub-directory names.
    :param class_mode: must be in ``self.allowed_class_modes``.
    :param follow_links: forwarded to the file listing helper.
    :param subset: forwarded to ``set_processing_attrs``.
    :param interpolation: accepted but not used by this initialiser.
    :param dtype: stored on ``self.dtype``.
    :raises ValueError: if ``class_mode`` is not an allowed mode.

    ``batch_size``, ``shuffle`` and ``seed`` are forwarded to the parent
    initialiser.
    """
    super(DirectoryIterator, self).set_processing_attrs(sound_data_generator,
                                                        target_size,
                                                        subset)
    self.directory = directory
    self.classes = classes
    if class_mode not in self.allowed_class_modes:
        raise ValueError(
            'Invalid class_mode: {}; expected one of: {}'.format(
                class_mode, self.allowed_class_modes))
    self.class_mode = class_mode
    self.dtype = dtype
    self.samples = 0

    if not classes:
        classes = [
            subdir for subdir in sorted(os.listdir(directory))
            if os.path.isdir(os.path.join(directory, subdir))
        ]
    self.num_classes = len(classes)
    self.class_indices = dict(zip(classes, range(len(classes))))

    self.filenames = []
    classes_list = []
    pool = multiprocessing.pool.ThreadPool()
    try:
        results = [
            pool.apply_async(_list_valid_filenames_in_directory,
                             (os.path.join(directory, subdir),
                              self.white_list_formats, self.split,
                              self.class_indices, follow_links))
            for subdir in classes
        ]
        for res in results:
            class_ids, filenames = res.get()
            classes_list.append(class_ids)
            self.filenames += filenames
    finally:
        # Always release the worker threads, even when listing fails.
        pool.close()
        pool.join()

    self.samples = len(self.filenames)
    self.classes = np.zeros((self.samples, ), dtype='int32')
    # Concatenate the per-directory class ids in submission order.
    i = 0
    for class_ids in classes_list:
        self.classes[i:i + len(class_ids)] = class_ids
        i += len(class_ids)

    print('Found %d wav files belonging to %d classes.' %
          (self.samples, self.num_classes))

    self._filepaths = [
        os.path.join(self.directory, fname) for fname in self.filenames
    ]
    super(DirectoryIterator, self).__init__(self.samples, batch_size,
                                            shuffle, seed)
def buildDecisionTree(df, root, file, config, dataset_features, parent_level=0, leaf_id=0, parents='root', tree_id=0, validation_df=None, main_process_id=None):
    """Build one level of the decision tree for ``df`` and recurse into branches.

    The split feature is chosen by ``findDecision(df, config)``.  For every
    distinct value of that feature a sub-dataset is passed to
    ``createBranch``: directly when ``config['enableParallelism']`` is not
    True, otherwise the arguments are queued in ``input_params`` and run
    later, in a ``MyPool`` or serially through ``createBranchWrapper``.
    A trailing ``else`` rule is added as well: the most frequent
    ``Decision`` for an object-typed ``Decision`` column, its mean
    otherwise.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source -- confirm the block structure against the
    original file before relying on it.

    :param df: data frame whose last column is excluded from
        ``feature_names``; it must have a ``Decision`` column.
    :param root: level indicator; ``1`` marks the root call.
    :param file: path of the rules file; its stem names the JSON file.
    :param config: dict read for ``enableParallelism``, ``algorithm``,
        ``enableRandomForest``, ``enableGBM``, ``enableAdaboost`` and
        ``num_cores``.
    :param dataset_features: mapping of feature name to dtype name.
    :param parent_level: used for the ``depth`` of the rule descriptor.
    :param leaf_id: overwritten before use in the visible code.
    :param parents: forwarded to ``createBranch`` and stored in rules.
    :param tree_id: forwarded to ``createBranch`` and stored in rules.
    :param validation_df: not used in the visible code.
    :param main_process_id: pid inspected with ``psutil`` to count the
        active processes; ``None`` forces the serial path.
    :return: ``decision_rules`` (list) when ``root != 1``; otherwise
        ``models``, which holds the loaded rules module only for a plain
        decision tree (no random forest / GBM / Adaboost).
    """
    models = []
    decision_rules = []
    feature_names = df.columns[0:-1]
    enableParallelism = config['enableParallelism']
    algorithm = config['algorithm']  # read but not used further in this function
    json_file = file.split(".")[0] + ".json"
    random_forest_enabled = config['enableRandomForest']
    enableGBM = config['enableGBM']
    enableAdaboost = config['enableAdaboost']
    if root == 1:
        if random_forest_enabled != True and enableGBM != True and enableAdaboost != True:
            raw_df = df.copy()  # kept from the original; not read again below
    #--------------------------------------
    df_copy = df.copy()
    winner_name, num_of_instances, metric, metric_name = findDecision(
        df, config)
    #find winner index, this cannot be returned by find decision because columns dropped in previous steps
    j = 0
    for i in dataset_features:
        if i == winner_name:
            winner_index = j
        j = j + 1
    numericColumn = False
    if dataset_features[winner_name] != 'object':
        numericColumn = True
    #restoration
    columns = df.shape[1]
    for i in range(0, columns - 1):
        #column_name = df.columns[i]; column_type = df[column_name].dtypes #numeric field already transformed to object. you cannot check it with df itself, you should check df_copy
        column_name = df_copy.columns[i]
        column_type = df_copy[column_name].dtypes
        # copy back every non-object column except the winner from the snapshot taken before findDecision
        if column_type != 'object' and column_name != winner_name:
            df[column_name] = df_copy[column_name]
    classes = df[winner_name].value_counts().keys().tolist()
    #print("classes: ",classes," in ", winner_name)
    #-----------------------------------------------------
    num_cores = config["num_cores"]
    input_params = []
    #serial approach
    for i in range(0, len(classes)):
        current_class = classes[i]
        subdataset = df[df[winner_name] == current_class]
        subdataset = subdataset.drop(columns=[winner_name])
        branch_index = i * 1
        #create branches serially
        if enableParallelism != True:
            if i == 0:
                # the descriptor comment line is written once, before the first branch
                descriptor = {
                    "feature": winner_name,
                    "instances": num_of_instances,
                    #"metric_name": metric_name,
                    "metric_value": round(metric, 4),
                    "depth": parent_level + 1
                }
                descriptor = "# " + json.dumps(descriptor)
                functions.storeRule(
                    file, (functions.formatRule(root), "", descriptor))
            results = createBranch(config,
                                   current_class,
                                   subdataset,
                                   numericColumn,
                                   branch_index,
                                   winner_name,
                                   winner_index,
                                   root,
                                   parents,
                                   file,
                                   dataset_features,
                                   num_of_instances,
                                   metric,
                                   tree_id=tree_id,
                                   main_process_id=main_process_id)
            decision_rules = decision_rules + results
        else:
            # parallel mode: only queue the arguments; the branches are run further below
            input_params.append(
                (config, current_class, subdataset, numericColumn,
                 branch_index, winner_name, winner_index, root, parents, file,
                 dataset_features, num_of_instances, metric, tree_id,
                 main_process_id))
    #---------------------------
    #add else condition in the decision tree
    # NOTE(review): `subdataset` here is whatever the loop above left behind (the last class) -- confirm this is intended
    if df.Decision.dtypes == 'object':  #classification
        pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index()
        pivot = pivot.rename(columns={
            "Decision": "Instances",
            "index": "Decision"
        })
        pivot = pivot.sort_values(by=["Instances"],
                                  ascending=False).reset_index()
        else_decision = "return '%s'" % (pivot.iloc[0].Decision)
        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(
                file, (functions.formatRule(root + 1), else_decision))
        else:  #parallelism
            leaf_id = str(uuid.uuid1())
            check_rule = "else: " + else_decision
            sample_rule = {}
            sample_rule["current_level"] = root
            sample_rule["leaf_id"] = leaf_id
            sample_rule["parents"] = parents
            sample_rule["rule"] = check_rule
            sample_rule["feature_idx"] = -1
            sample_rule["feature_name"] = ""
            sample_rule["instances"] = df.shape[0]
            sample_rule["metric"] = 0
            sample_rule["return_statement"] = 0
            sample_rule["tree_id"] = tree_id
            #json to string
            sample_rule = json.dumps(sample_rule)
            decision_rules.append(sample_rule)
    else:  #regression
        else_decision = "return %s" % (subdataset.Decision.mean())
        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(
                file, (functions.formatRule(root + 1), else_decision))
        else:
            leaf_id = str(uuid.uuid1())
            check_rule = "else: " + else_decision
            # NOTE(review): unlike the classification rule above, no "feature_idx" key is set here
            sample_rule = {}
            sample_rule["current_level"] = root
            sample_rule["leaf_id"] = leaf_id
            sample_rule["parents"] = parents
            sample_rule["rule"] = check_rule
            sample_rule["tree_id"] = tree_id
            sample_rule["feature_name"] = ""
            sample_rule["instances"] = 0
            sample_rule["metric"] = 0
            sample_rule["return_statement"] = 1
            #json to string
            sample_rule = json.dumps(sample_rule)
            decision_rules.append(sample_rule)
    #---------------------------
    # count this process tree so the pool is only used when enough cores are free
    try:
        main_process = psutil.Process(main_process_id)
        children = main_process.children(recursive=True)
        active_processes = len(children) + 1  #plus parent
        #active_processes = len(children)
    except:
        # any failure (bare except) falls back to a value that makes the core check below fail
        active_processes = 100  #set a large initial value
    results = []
    #create branches in parallel
    if enableParallelism == True:
        required_threads = active_processes + len(classes)
        #if parent_level == 0 and random_forest_enabled != True:
        if main_process_id != None and num_cores >= required_threads:  #len(classes) branches will be run in parallel
            #POOL_SIZE = num_cores
            POOL_SIZE = len(classes)
            #with closing(multiprocessing.Pool(POOL_SIZE)) as pool:
            with closing(MyPool(POOL_SIZE)) as pool:
                funclist = []
                for input_param in input_params:
                    f = pool.apply_async(createBranchWrapper,
                                         [createBranch, input_param])
                    funclist.append(f)
                #all functions registered here
                for f in funclist:
                    branch_results = f.get(timeout=100000)
                    for branch_result in branch_results:
                        results.append(branch_result)
                pool.close()
                pool.terminate()
            #--------------------------------
        else:  #serial
            for input_param in input_params:
                sub_results = createBranchWrapper(createBranch, input_param)
                for sub_result in sub_results:
                    results.append(sub_result)
        #--------------------------------
        decision_rules = decision_rules + results
    #--------------------------------
    if root != 1:  #return children results until the root node
        return decision_rules
    #---------------------------------------------
    if root == 1:
        if enableParallelism == True:
            #custom rules are stored in decision_rules. merge them all in a json file first
            json_rules = "[\n"  #initialize
            file_index = 0
            for custom_rule in decision_rules:
                json_rules += custom_rule
                if file_index < len(decision_rules) - 1:
                    json_rules += ", "
                json_rules += "\n"
                file_index = file_index + 1
            #-----------------------------------
            json_rules += "]"
            functions.createFile(json_file, json_rules)
            #-----------------------------------
            #reconstruct rules from json to py
            reconstructRules(json_file, feature_names)
            #-----------------------------------
        #is regular decision tree
        if config['enableRandomForest'] != True and config[
                'enableGBM'] != True and config['enableAdaboost'] != True:
            #this is reguler decision tree. find accuracy here.
            moduleName = "outputs/rules/rules"
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname,
                                      description)  #rules0
            models.append(myrules)
    return models
def __init__(self,
             directory,
             image_data_generator,
             target_size=(256, 256),
             batch_size=32,
             shuffle=True,
             seed=None,
             data_format=None,
             save_to_dir=None,
             save_prefix='',
             save_format='png',
             follow_links=False):
    """Index the ``.npy`` density maps below ``directory``.

    Every sub-directory of ``directory`` is scanned; ``npy`` files are
    counted and listed per sub-directory using a thread pool.  The colour
    mode is fixed to ``'grayscale'`` and the class mode to ``None``.

    :param directory: root folder containing the sub-directories to scan.
    :param image_data_generator: stored on the instance unchanged.
    :param target_size: map size; combined with a single channel into
        ``self.image_shape``.
    :param data_format: ``'channels_last'`` puts the channel axis last, any
        other value puts it first; defaults to ``K.image_data_format()``.
    :param follow_links: forwarded to the file counting/listing helpers.

    ``batch_size``, ``shuffle`` and ``seed`` are forwarded to the parent
    initialiser; the ``save_*`` parameters are stored on the instance.
    """
    if data_format is None:
        data_format = K.image_data_format()
    self.directory = directory
    self.image_data_generator = image_data_generator
    self.target_size = tuple(target_size)
    # density maps are always grayscale
    self.color_mode = 'grayscale'
    self.data_format = data_format
    if self.data_format == 'channels_last':
        self.image_shape = self.target_size + (1,)
    else:
        self.image_shape = (1,) + self.target_size
    # class mode is always None
    self.class_mode = None
    self.save_to_dir = save_to_dir
    self.save_prefix = save_prefix
    self.save_format = save_format

    # density maps are stored as npy files
    white_list_formats = {'npy'}

    # first, count the number of samples and classes
    self.samples = 0

    classes = [
        subdir for subdir in sorted(os.listdir(directory))
        if os.path.isdir(os.path.join(directory, subdir))
    ]
    self.num_class = len(classes)
    self.class_indices = dict(zip(classes, range(len(classes))))

    class_dirs = [os.path.join(directory, subdir) for subdir in classes]

    pool = multiprocessing.pool.ThreadPool()
    try:
        function_partial = partial(_count_valid_files_in_directory,
                                   white_list_formats=white_list_formats,
                                   follow_links=follow_links)
        self.samples = sum(pool.map(function_partial, class_dirs))

        print('Found %d density maps.' % self.samples)

        # second, build an index of the images in the different class
        # subfolders
        results = [
            pool.apply_async(_list_valid_filenames_in_directory,
                             (dirpath, white_list_formats,
                              self.class_indices, follow_links))
            for dirpath in class_dirs
        ]
        self.filenames = []
        self.classes = np.zeros((self.samples,), dtype='int32')
        i = 0
        for res in results:
            class_ids, filenames = res.get()
            self.classes[i:i + len(class_ids)] = class_ids
            self.filenames += filenames
            i += len(class_ids)
    finally:
        # Always release the worker threads, even when listing fails.
        pool.close()
        pool.join()

    super(NpyDirectoryIterator, self).__init__(self.samples, batch_size,
                                               shuffle, seed)
designspaceItalic = fontTools.designspaceLib.DesignSpaceDocument.fromfile( INPUT_DIR / "CascadiaCode_variable_italic.designspace") designspaceItalic.instances = [ s for s in designspaceItalic.instances if s.lib.get("com.schriftgestaltung.export", True) ] # Stage 1: Make all the things. pool = multiprocessing.pool.Pool(processes=multiprocessing.cpu_count()) processes = [] processes.append( pool.apply_async( build_font_variable, ( designspace, "Cascadia Code", args.vtt_compile, ), )) if args.italic: processes.append( pool.apply_async( build_font_variable, ( designspaceItalic, "Cascadia Code Italic", args.vtt_compile, ), )) if args.mono: processes.append(
'batch_%05d_%05d_%d' % (frameNumbers[0], frameNumbers[-1], options.bundleLength)) thisOutputFolder = os.path.join(outputFolder, batchFolderName) if not options.logBatches: logger.info('Running processing batch in output folder: ' + thisOutputFolder + '\n' + 'with options: ' + extraOptions + ' --stereo-arguments ' + options.stereoArgs) if not options.dryRun: # Generate the command call taskHandles.append( pool.apply_async( processBatch, (batchImageCameraPairs, lidarFolder, options.referenceDem, thisSkipInterval, thisOutputFolder, extraOptions, outputResolution, options.stereoArgs, batchNum, batchLogPath))) batchNum += 1 # Reset these lists batchImageCameraPairs = [] frameNumbers = [] # Advance to the frame that starts the next batch if hitBreakFrame: # When we hit a break in the frames we need to start the # next batch after the break frame i = j + 1 else: # Start in the next frame that was not used as a "left" stereo image.
def _generate_descriptor_matrices(self, data_set, **kwargs): """ Generate info and descriptor matrices based on ingest type. :param data_set: Iterable of data elements to generate combined info and descriptor matrices for. :type item_iter: collections.Set[smqtk.representation.DataElement] :param limit: Limit the number of descriptor entries to this amount. :type limit: int :return: Combined info and descriptor matrices for all base images :rtype: (numpy.core.multiarray.ndarray, numpy.core.multiarray.ndarray) """ if not data_set: raise ValueError("No data given to process.") inf = float('inf') descriptor_limit = kwargs.get('limit', inf) per_item_limit = numpy.floor(float(descriptor_limit) / len(data_set)) if len(data_set) == 1: # because an iterable doesn't necessarily have a next() method di = iter(data_set).next() # Check for checkpoint files info_fp, desc_fp = \ self._get_standard_info_descriptors_filepath(di) # Save out data bytes to temporary file temp_img_filepath = self._get_data_temp_path(di) try: # Generate descriptors utils.generate_descriptors(self.EXE, temp_img_filepath, self.descriptor_type(), info_fp, desc_fp, per_item_limit) finally: # clean temp file di.clean_temp() return numpy.load(info_fp), numpy.load(desc_fp) else: # compute and V-stack matrices for all given images pool = multiprocessing.Pool(processes=self.parallel) # Mapping of UID to tuple containing: # (info_fp, desc_fp, async processing result, tmp_clean_method) r_map = {} with SimpleTimer("Computing descriptors async...", self._log.debug): for di in data_set: # Creating temporary image file from data bytes tmp_img_fp = self._get_data_temp_path(di) info_fp, desc_fp = \ self._get_standard_info_descriptors_filepath(di) args = (self.EXE, tmp_img_fp, self.descriptor_type(), info_fp, desc_fp) r = pool.apply_async(utils.generate_descriptors, args) r_map[di.uuid()] = (info_fp, desc_fp, r, di.clean_temp) pool.close() # Pass through results from descriptor generation, aggregating # matrix shapes. 
# - Transforms r_map into: # UID -> (info_fp, desc_fp, starting_row, SubSampleIndices) self._log.debug("Constructing information for super matrices...") s_keys = sorted(r_map.keys()) running_height = 0 # info and desc heights congruent i_width = None d_width = None for uid in s_keys: ifp, dfp, r, tmp_clean_method = r_map[uid] # descriptor generation may have failed for this ingest UID try: i_shape, d_shape = r.get() except RuntimeError, ex: self._log.warning( "Descriptor generation failed for " "UID[%s], skipping its inclusion in " "model: %s", uid, str(ex)) r_map[uid] = None continue finally: # Done with image file, so remove from filesystem tmp_clean_method()
for div in divs: post = get_post(div) if post: queue.put(post) def get_queue_contents(queue): def get_queue(): try: return queue.get_nowait() except: return None return [x for x in iter(get_queue, None)] if __name__ == '__main__': urls = ['http://www.holycool.net/page/%d' % x for x in range(1, 128)] queue = Queue.Queue() lock = multiprocessing.Lock() pool = multiprocessing.pool.ThreadPool(60) for url in urls: pool.apply_async(fetch_posts, args=(url, queue, lock)) pool.close() pool.join() posts = get_queue_contents(queue) with open('scraped.txt', 'w') as f: f.write(str(posts))
def _generate_descriptor_matrices(self, data_set, **kwargs): """ Generate info and descriptor matrices based on ingest type. :param data_set: Iterable of data elements to generate combined info and descriptor matrices for. :type item_iter: collections.Set[smqtk.representation.DataElement] :param limit: Limit the number of descriptor entries to this amount. :type limit: int :return: Combined info and descriptor matrices for all base images :rtype: (numpy.core.multiarray.ndarray, numpy.core.multiarray.ndarray) """ descriptor_limit = kwargs.get('limit', float('inf')) # With videos, an "item" is one video, so, collect for a while video # as normal, then subsample from the full video collection. per_item_limit = numpy.floor(float(descriptor_limit) / len(data_set)) # If an odd number of jobs, favor descriptor extraction if self.parallel: descr_parallel = int(max(1, math.ceil(self.parallel / 2.0))) extract_parallel = int(max(1, math.floor(self.parallel / 2.0))) else: cpuc = multiprocessing.cpu_count() descr_parallel = int(max(1, math.ceil(cpuc / 2.0))) extract_parallel = int(max(1, math.floor(cpuc / 2.0))) # For each video, extract frames and submit colorDescriptor processing # jobs for each frame, combining all results into a single matrix for # return. pool = multiprocessing.Pool(processes=descr_parallel) # Mapping of [UID] to [frame] to tuple containing: # (info_fp, desc_fp, async processing result) r_map = {} with SimpleTimer("Extracting frames and submitting descriptor jobs...", self._log.debug): for di in data_set: r_map[di.uuid()] = {} tmp_vid_fp = self._get_data_temp_path(di) p = dict(self.FRAME_EXTRACTION_PARAMS) vmd = get_metadata_info(tmp_vid_fp) p['second_offset'] = vmd.duration * p['second_offset'] p['max_duration'] = vmd.duration * p['max_duration'] fm = video_utils.ffmpeg_extract_frame_map( self._work_dir, tmp_vid_fp, parallel=extract_parallel, **p) # Compute descriptors for extracted frames. 
for frame, imgPath in fm.iteritems(): info_fp, desc_fp = \ self._get_standard_info_descriptors_filepath(di, frame) r = pool.apply_async(utils.generate_descriptors, args=(self.EXE, imgPath, self.descriptor_type(), info_fp, desc_fp)) r_map[di.uuid()][frame] = (info_fp, desc_fp, r) # Clean temporary video file file while computing descriptors # This does not remove the extracted frames that the underlying # detector/descriptor is working on. di.clean_temp() pool.close() # Each result is a tuple of two ndarrays: info and descriptor matrices with SimpleTimer("Collecting shape information for super matrices...", self._log.debug): running_height = 0 i_width = None d_width = None # Transform r_map[uid] into: # (info_mat_files, desc_mat_files, sR, ssi_list) # -> files in frame order uids = sorted(r_map) for uid in uids: video_num_desc = 0 video_info_mat_fps = [] # ordered list of frame info mat files video_desc_mat_fps = [] # ordered list of frame desc mat files for frame in sorted(r_map[uid]): ifp, dfp, r = r_map[uid][frame] # Descriptor generation may have failed for this UID try: i_shape, d_shape = r.get() except RuntimeError, ex: self._log.warning( 'Descriptor generation failed for ' 'frame %d in video UID[%s]: %s', frame, uid, str(ex)) r_map[uid] = None continue if d_width is None and d_shape[0] != 0: i_width = i_shape[1] d_width = d_shape[1] # Skip if there were no descriptors generated for this # frame if d_shape[1] == 0: continue video_info_mat_fps.append(ifp) video_desc_mat_fps.append(dfp) video_num_desc += d_shape[0] # If combined descriptor height exceeds the per-item limit, # generate a random subsample index list ssi = None if video_num_desc > per_item_limit: ssi = sorted( numpy.random.permutation(video_num_desc) [:per_item_limit]) video_num_desc = len(ssi) r_map[uid] = (video_info_mat_fps, video_desc_mat_fps, running_height, ssi) running_height += video_num_desc
def __init__(self,
             directory,
             classes=None,
             number_subsequences=32,
             dim=(32, 32, 32),
             n_channels=6,
             n_classes=10,
             shuffle=True,
             n_samples=None,
             seed=None,
             faster=True,
             online_training=False):
    """Index the class sub-directories of ``directory`` and store the options.

    Every sub-directory of ``directory`` is treated as one class (unless
    ``classes`` is given).  Usable ``csv`` files are counted and listed
    per class using a thread pool.

    :param directory: root folder containing one sub-directory per class.
    :param classes: optional explicit list of class sub-directory names.
    :param number_subsequences: stored as-is; a value of 1 disables
        ``shrink_timesteps``.
    :param n_classes: accepted for interface compatibility; the number of
        classes is always derived from the class list.
    :param n_samples: samples to pick per class; when falsy, the size of
        the smallest class is used.
    :param faster: ``True`` selects 16 samples per batch, a positive int
        selects that many, anything else selects 1.

    The remaining parameters are stored unchanged on attributes of the
    same name.
    """
    self.directory = directory
    self.classes = classes
    self.dim = dim
    self.labels = None
    self.list_IDs = None
    self.n_channels = n_channels
    self.shuffle = shuffle
    self.seed = seed
    self.online_training = online_training

    self.shrink_timesteps = number_subsequences != 1
    self.number_subsequences = number_subsequences

    # NOTE(review): `== True` is kept on purpose for backward
    # compatibility; it also matches the integer 1, so faster=1 yields 16.
    if faster == True:  # noqa: E712
        self.faster = 16
    elif isinstance(faster, int) and faster > 0:
        self.faster = faster
    else:
        self.faster = 1

    self.number_samples_per_batch = self.faster
    self.number_samples_per_class_to_pick = n_samples

    if not classes:
        classes = [
            subdir for subdir in sorted(os.listdir(directory))
            if os.path.isdir(os.path.join(directory, subdir))
        ]
    self.classes = classes
    self.n_classes = len(classes)
    self.class_indices = dict(zip(classes, range(len(classes))))

    class_dirs = [os.path.join(directory, subdir) for subdir in classes]

    pool = multiprocessing.pool.ThreadPool()
    try:
        # dict mapping each class directory name to its number of usable files
        function_partial = partial(image._count_valid_files_in_directory,
                                   white_list_formats={'csv'},
                                   follow_links=None,
                                   split=None)
        self.samples = dict(zip(classes, pool.map(function_partial,
                                                  class_dirs)))

        results = [
            pool.apply_async(
                image._list_valid_filenames_in_directory,
                (dirpath, {'csv'}, None, self.class_indices, None))
            for dirpath in class_dirs
        ]

        # key "<class index>_<position within class>" -> file name
        self.filename_dict = {}
        for res in results:
            class_ids, filenames = res.get()
            for index, class_i in enumerate(class_ids):
                self.filename_dict[f"{class_i}_{index}"] = filenames[index]
    finally:
        # Always release the worker threads, even when listing fails.
        pool.close()
        pool.join()

    if not n_samples:
        self.number_samples_per_class_to_pick = min(self.samples.values())

    self.on_epoch_end()
def run_calls(fun, list_of_args, extra_args=(), pool_type='processes',
              nb_workers=multiprocessing.cpu_count(), timeout=60, verbose=True,
              initializer=None, initargs=None):
    """
    Run a function several times in parallel with different inputs.

    Args:
        fun: function to be called several times in parallel.
        list_of_args: list of (first positional) arguments passed to fun,
            one per call. An element that is exactly a tuple is unpacked
            into several positional arguments.
        extra_args: tuple containing extra arguments to be passed to fun
            (same value for all calls)
        pool_type: either 'processes' or 'threads'
        nb_workers: number of calls run simultaneously
        timeout: number of seconds allowed per function call
        verbose: either True (show the amount of computed calls) or False
        initializer, initargs (optional): if initializer is not None then
            each worker process will call initializer(*initargs) when it
            starts. Only used for pool_type 'processes'.

    Return:
        list of outputs, in the order of list_of_args

    Raises:
        ValueError: if pool_type is neither 'processes' nor 'threads'.
        multiprocessing.TimeoutError: if a call exceeds timeout.
        Any exception raised by fun is re-raised here.
    """
    if pool_type == 'processes':
        # Pool unpacks initargs with `*`, so None must become an empty tuple
        # or a supplied initializer would fail with `initializer(*None)`.
        pool = multiprocessing.Pool(nb_workers, initializer,
                                    () if initargs is None else initargs)
    elif pool_type == 'threads':
        pool = multiprocessing.pool.ThreadPool(nb_workers)
    else:
        raise ValueError("unknow pool_type {}".format(pool_type))

    outputs = []
    try:
        with contextlib.ExitStack() as stack:
            on_done = None
            if verbose:
                bar = stack.enter_context(tqdm(total=len(list_of_args)))

                def on_done(_result):
                    bar.update(1)

            results = []
            for x in list_of_args:
                # Exact type check kept: tuple subclasses (e.g. namedtuples)
                # are still passed as one single argument.
                args = (x if type(x) is tuple else (x, )) + extra_args
                results.append(
                    pool.apply_async(fun, args=args, callback=on_done))

            for r in results:
                try:
                    outputs.append(r.get(timeout))
                except KeyboardInterrupt:
                    pool.terminate()
                    sys.exit(1)

        pool.close()
        pool.join()
    finally:
        # No-op after a clean close/join; on timeout or worker error it
        # stops the workers instead of leaking the pool.
        pool.terminate()

    return outputs
def runFBA_multi(inputTar,
                 gem_sbml,
                 outputTar,
                 sim_type,
                 source_reaction,
                 target_reaction,
                 source_coefficient,
                 target_coefficient,
                 is_max,
                 fraction_of,
                 dont_merge=True,
                 num_workers=10,
                 pathway_id='rp_pathway',
                 objective_id=None,
                 compartment_id='MNXC3',
                 fill_orphan_species=False,
                 species_group_id='central_species',
                 sink_species_group_id='rp_sink_species'):
    """Subprocess implementation of rpFBA

    Extracts ``inputTar`` into a temporary folder, runs ``singleFBA_hdd``
    on every extracted file in a pool of ``num_workers`` processes and
    packs every file found in the temporary output folder into
    ``outputTar`` (gzip-compressed).

    :param inputTar: Path of the TAR rpSBML files
    :param gem_sbml: Path to the GEM file
    :param outputTar: Path of the TAR output
    :param sim_type: The type of simulation to use. Available simulation types include: fraction, fba, rpfba
    :param source_reaction: The reaction id of the source reaction.
    :param target_reaction: The reaction id of the target reaction. Note that if fba or rpfba options are used, then these are ignored
    :param source_coefficient: The source coefficient
    :param target_coefficient: The target coefficient
    :param is_max: Maximise or minimise the objective
    :param fraction_of: The fraction of the optimum. Note that this value is ignored if fba is used
    :param dont_merge: Output the merged model (Default: True)
    :param num_workers: The number of processes to use (Default: 10)
    :param pathway_id: The id of the heterologous pathway (Default: rp_pathway)
    :param objective_id: Overwrite the auto-generated id of the results (Default: None)
    :param compartment_id: The SBML compartment id (Default: MNXC3)
    :param fill_orphan_species: Add pseudo reactions that consume/produce single parent species. Note in development
    :param species_group_id: The id of the central species (Default: central_species)
    :param sink_species_group_id: The id of the sink species (Default: rp_sink_species)

    :type inputTar: str
    :type gem_sbml: str
    :type outputTar: str
    :type sim_type: str
    :type source_reaction: str
    :type target_reaction: str
    :type source_coefficient: float
    :type target_coefficient: float
    :type is_max: bool
    :type fraction_of: float
    :type dont_merge: bool
    :type num_workers: int
    :type pathway_id: str
    :type objective_id: str
    :type compartment_id: str
    :type fill_orphan_species: bool
    :type species_group_id: str
    :type sink_species_group_id: str

    :return: Success or failure of the function (False when the input
        archive is empty or no output file was produced)
    :rtype: bool
    """

    def _model_name(path):
        # File name without its .sbml / .xml / .rpsbml markers.
        return os.path.basename(path).replace('.sbml', '').replace(
            '.xml', '').replace('.rpsbml', '')

    with tempfile.TemporaryDirectory() as tmpOutputFolder:
        with tempfile.TemporaryDirectory() as tmpInputFolder:
            # NOTE(review): extractall() trusts member paths; if inputTar
            # can come from an untrusted source, validate the members (or
            # use an extraction filter) before extracting.
            with tarfile.open(inputTar, mode='r') as tar:
                tar.extractall(path=tmpInputFolder)
            input_paths = glob.glob(tmpInputFolder + '/*')
            if len(input_paths) == 0:
                logging.error('Input file is empty')
                return False
            #HERE SPECIFY THE NUMBER OF CORES
            pool = nonDeamonicPool(processes=num_workers)
            try:
                results = [
                    pool.apply_async(singleFBA_hdd,
                                     args=(
                                         _model_name(sbml_path),
                                         sbml_path,
                                         gem_sbml,
                                         sim_type,
                                         source_reaction,
                                         target_reaction,
                                         source_coefficient,
                                         target_coefficient,
                                         is_max,
                                         fraction_of,
                                         tmpOutputFolder,
                                         dont_merge,
                                         pathway_id,
                                         objective_id,
                                         compartment_id,
                                         fill_orphan_species,
                                         species_group_id,
                                         sink_species_group_id,
                                     )) for sbml_path in input_paths
                ]
                # Wait for every job; a worker exception is re-raised here.
                for p in results:
                    p.get()
                pool.close()
                pool.join()
            finally:
                # No-op after a clean close/join; reclaims the workers when
                # a job failed.
                pool.terminate()
        output_paths = glob.glob(tmpOutputFolder + '/*')
        if len(output_paths) == 0:
            logging.error('rpFBA has not produced any results')
            return False
        with tarfile.open(outputTar, mode='w:gz') as ot:
            for sbml_path in output_paths:
                file_name = str(_model_name(sbml_path)) + '.sbml.xml'
                info = tarfile.TarInfo(file_name)
                info.size = os.path.getsize(sbml_path)
                # Close each source file once it has been copied into the
                # archive (it was previously left open).
                with open(sbml_path, 'rb') as sbml_file:
                    ot.addfile(tarinfo=info, fileobj=sbml_file)
    return True
def generate(
        distros=None,
        version=None,
        jobs=None,
        publish_under=None,
        generate_docker_tarball=False,
        generate_distro_specific_sct_tarball=False,
        build_options=None,
        proxy=False,
):
    """Generate distro Dockerfiles, build the images and optionally export them.

    :param distros: iterable of distro identifiers; defaults to
        ``default_distros``.
    :param version: version string embedded in the image names; defaults to
        ``default_version``.
    :param jobs: number of worker threads used for ``docker build``
        (passed straight to ``ThreadPool``; ``None`` lets it pick).
    :param publish_under: if truthy, every image is tagged and pushed as
        ``<publish_under>:<name>``.
    :param generate_docker_tarball: if true, ``docker save`` each image into
        ``<name>-docker.tar.xz``.
    :param generate_distro_specific_sct_tarball: if true, extract ``sct_*``
        from each image into ``<name>-offline.tar.xz``.
    :param build_options: extra arguments appended to every ``docker build``
        command line (``None`` means no extra arguments).
    :param proxy: forwarded to ``sct_docker.generate``; when truthy the
        function returns right after the build step.
    :raises RuntimeError: if a required executable is missing or if any
        image build returns a non-zero exit code.
    """
    if distros is None:
        distros = default_distros
    if version is None:
        version = default_version
    # A fresh list per call: avoids the shared mutable default and accepts
    # any sequence from the caller.
    build_options = list(build_options) if build_options else []

    logger.info("Generating distro Dockerfiles")

    names = []
    for distro in distros:
        name = "sct-{}-{}".format(version, distro.replace(":", "-")).lower()
        logger.info("- %s...", name)
        # prevent building official simultaneously to alias
        lock = threading.Lock()

        name = sct_docker.generate(
            distro=distro,
            version=version,
            name=name,
            commands=default_commands,
            install_fsleyes=True,
            install_tools=True,
            install_python=True,
            #install_fsl=True,
            configure_ssh=True,
            verbose=False,
            proxy=proxy,
        )
        names.append((name, lock))

        if distro == "official":
            name = "sct-{}-{}".format(version, "official").lower()
            logger.info("- %s...", name)
            name = sct_docker.generate(
                distro=official_distro,
                version=version,
                name=name,
                commands=default_commands,
                install_fsleyes=True,
                #install_fsl=True,
                configure_ssh=True,
                verbose=False,
                proxy=proxy,
            )
            # Shares the lock of the entry above so both are never built
            # at the same time.
            names.append((name, lock))

    logger.info("Done generating distro Dockerfiles")

    logger.info("Building images")

    if not check_exe("docker"):
        raise RuntimeError(
            "You might want to have docker available when running this tool")

    def docker_build(cmd, lock):
        # Builds holding the same lock run one after another.
        with lock:
            return subprocess.call(cmd)

    pool = multiprocessing.pool.ThreadPool(jobs)
    try:
        res = [
            pool.apply_async(
                docker_build,
                (["docker", "build", "-t", name, name] + build_options, lock),
            )
            for name, lock in names
        ]

        errs = []
        for (name, _), promise in zip(names, res):
            err = promise.get()
            if err != 0:
                logger.error("{} failed with error code {}".format(name, err))
            errs.append(err)

        pool.close()
    finally:
        # Always release the worker threads, even if a build raised.
        pool.terminate()
        pool.join()

    logger.info("Done building images")

    failed = False
    for (name, _), err in zip(names, errs):
        if err == 0:
            logger.info("{} finished successfully".format(name))
        else:
            logger.error("{} failed with error code {}".format(name, err))
            failed = True

    if failed:
        logger.error("Not proceeding further as one distro failed: %s", errs)
        raise RuntimeError("Failed generating one distro")

    if proxy:
        return

    if publish_under:
        logger.info("Publishing on Docker hub")
        for name, _ in names:
            logger.info("- %s...", name)
            cmd = ["docker", "tag", name, "{}:{}".format(publish_under, name)]
            subprocess.call(cmd)
            cmd = ["docker", "push", "{}:{}".format(publish_under, name)]
            subprocess.call(cmd)
        logger.info("Done publishing")

    if generate_docker_tarball:
        logger.info("Generating Docker tarballs")
        for name, _ in names:
            logger.info("- %s...", name)
            cmd = ["bash", "-c", "docker save {}" \
             " | xz --threads=0 --best > {}-docker.tar.xz".format(name, name)]
            subprocess.call(cmd)
        logger.info("Done generating Docker tarballs")

    if generate_distro_specific_sct_tarball:
        logger.info("Generating offline archives")
        if not (check_exe("xz") and check_exe("bash")):
            raise RuntimeError(
                "You might want to have bash & xz available when running this tool"
            )
        for name, _ in names:
            logger.info("- %s...", name)
            cmd = ["bash", "-c", "docker run --log-driver=none --entrypoint /bin/sh {} -c 'cd /home/sct; tar c sct_*'" \
             " | xz --threads=0 --best > {}-offline.tar.xz".format(name, name)]
            subprocess.call(cmd)
        logger.info("Done generating offline archives")
def Classification(MalwareCorpus, GoodwareCorpus, MaltestCorpus, GoodtestCorpus, Extn):
    """Train, tune and evaluate the MCWVarDiag and ArowDiag classifiers.

    Sample files are collected with ``GetFilesWithExtn`` from the four
    corpora, turned into TF-IDF feature vectors, and used to

    1. grid-search one hyper-parameter per model (``EtaList`` for
       ``MCWVarDiag``, ``CList`` for ``ArowDiag``) through ``GridSearchCV``
       calls dispatched on a ``MyPool`` of 4 workers;
    2. fit the best model of each kind on the training set;
    3. predict the test samples one at a time, calling ``partial_fit`` on the
       model whenever a prediction is wrong;
    4. write a report to ``<CW|AROW>_<Type>.txt`` and per-sample metrics to
       ``<CW|AROW>_<Type>_Metadata.txt`` in the current directory.

    :param MalwareCorpus: passed to ``GetFilesWithExtn`` for malware training samples
    :param GoodwareCorpus: passed to ``GetFilesWithExtn`` for goodware training samples
    :param MaltestCorpus: passed to ``GetFilesWithExtn`` for malware test samples
    :param GoodtestCorpus: passed to ``GetFilesWithExtn`` for goodware test samples
    :param Extn: file extension of the feature files; also selects the
        ``Type`` tag used in the output file names
    :return: None (results are printed and written to files)
    """
    # The feature-file extension determines the tag used in output file names.
    if 'datatxt' in Extn:
        Type = 'Drebin'
    elif 'WL2' in Extn:
        Type = 'WLK'
    elif '_pkg_adicfg_ret_.json.ADG.DirWLWODup' in Extn:
        Type = 'CWLK'
    else:
        Type = 'Other'

    # step 1 - split all samples to training set and test set
    logger.debug("Loading positive and negative samples file basename")
    TrainMalSamples = GetFilesWithExtn(MalwareCorpus, Extn)
    # Goodware lists are truncated to at most the number of malware samples.
    TrainGoodSamples = GetFilesWithExtn(GoodwareCorpus, Extn)[:len(TrainMalSamples)]
    TestMalSamples = GetFilesWithExtn(MaltestCorpus, Extn)
    TestGoodSamples = GetFilesWithExtn(GoodtestCorpus, Extn)[:len(TestMalSamples)]
    logger.info("All Samples loaded")
    print '# mal train samples:', len(TrainMalSamples)
    print '# good train samples:', len(TrainGoodSamples)
    print '# mal test samples:', len(TestMalSamples)
    print '# good test samples:', len(TestGoodSamples)
    logger.info("Training and test sets split finished")

    # Labels: +1 for malware, -1 for goodware.
    TrainMalLabels = np.ones(len(TrainMalSamples)).tolist()
    TestMalLabels = np.ones(len(TestMalSamples)).tolist()
    TrainGoodLabels = np.empty(len(TrainGoodSamples))
    TrainGoodLabels.fill(-1)
    TrainGoodLabels = TrainGoodLabels.tolist()
    TestGoodLabels = np.ones(len(TestGoodSamples))
    TestGoodLabels.fill(-1)
    TestGoodLabels = TestGoodLabels.tolist()
    logger.info("All labels created")

    # Malware samples always come first, goodware samples second.
    TrainSamples = TrainMalSamples + TrainGoodSamples
    TestSamples = TestMalSamples + TestGoodSamples
    TrainLabels = TrainMalLabels + TrainGoodLabels
    TestLabels = TestMalLabels + TestGoodLabels
    NumTestMalSamples = len(TestMalLabels)
    logger.info("All Samples loaded into training and testing sets")
    print "# Train Samples", len(TrainSamples)
    print "# Train Labels", len(TrainLabels)
    print "# Test Samples", len(TestSamples)
    print "# Test Labels", len(TestLabels)

    # step 2 - feature extracting
    TFIDFTransformer = TfidfTransformer()
    # input=u'filename': the vectorizer is given file paths and reads the files itself.
    NewLineCVetorizer = CountVectorizer(input=u'filename',
                                        lowercase=True,
                                        token_pattern=None,
                                        tokenizer=NewLineTokenizer,
                                        dtype=np.float64)
    print 'performing count vectorizing'
    TrainDocsTermsFVs = NewLineCVetorizer.fit_transform(TrainSamples)
    TestDocsTermsFVs = NewLineCVetorizer.transform(TestSamples)
    print 'performing tf-idf vectorizing'
    TrainFVs = TFIDFTransformer.fit_transform(TrainDocsTermsFVs)
    TestFVs = TFIDFTransformer.transform(TestDocsTermsFVs)
    print 'train term-doc matrix: ', TrainFVs.shape  # rows x cols; rows = docs, cols = features/terms
    print 'test term-doc matrix: ', TestFVs.shape

    # step 3 - classification
    logger.info("Performing Cross Validation")
    EtaList = [0, 0.1, 0.3, 0.5, 0.7, 0.9, 1]
    CWAccuracyList = []
    CList = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    AROWAccuracyList = []
    pool = MyPool(4)
    # One asynchronous GridSearchCV call per candidate eta.
    a = [
        pool.apply_async(GridSearchCV, (
            MCWVarDiag,
            5,
            e,
            len(TrainSamples),
            TrainFVs,
            TrainLabels,
        )) for e in EtaList
    ]
    CWAccuracyList = [res.get() for res in a]
    # Keep the candidate with the highest returned score (first one on ties).
    EtaBest = EtaList[CWAccuracyList.index(max(CWAccuracyList))]
    BestModel_CW = MCWVarDiag(EtaBest, epochs=50)

    # Same search for the C parameter of ArowDiag.
    a = [
        pool.apply_async(GridSearchCV, args=(
            ArowDiag,
            5,
            c,
            len(TrainSamples),
            TrainFVs,
            TrainLabels,
        )) for c in CList
    ]
    AROWAccuracyList = [res.get() for res in a]
    CBest = CList[AROWAccuracyList.index(max(AROWAccuracyList))]
    BestModel_AROW = ArowDiag(CBest, n_iters=50)
    pool.close()
    pool.join()
    print 'best model', BestModel_CW, max(CWAccuracyList)
    print 'best model', BestModel_AROW, max(AROWAccuracyList)

    logger.info("Applying Best Model on Testing Set")
    # Maps each model object to the short name used in output file names.
    modeldict = {BestModel_CW: 'CW', BestModel_AROW: 'AROW'}
    for Model in [BestModel_CW, BestModel_AROW]:
        T0 = time()
        # NOTE(review): neither f nor f1 is closed anywhere in this function.
        f = open(modeldict[Model] + '_' + Type + '.txt', 'w')
        f1 = open(modeldict[Model] + '_' + Type + '_Metadata.txt', 'w')
        Model.fit(TrainFVs, TrainLabels)
        PredictedLabels = []
        NewTestLabels = []
        i = 0
        # On iteration i the i-th malware sample and then the i-th goodware
        # sample are predicted, so the two classes alternate in the
        # prediction / online-update order.
        for TestFV, TestLabel in zip(TestFVs, TestLabels):
            #Mal Sample
            if i < NumTestMalSamples:
                TestMalLabel = np.array([TestLabel])
                PredictedLabel = Model.predict(TestFV)
                PredictedLabels.append(float(PredictedLabel))
                NewTestLabels.append(TestLabel)
                if float(PredictedLabel) != TestLabel:
                    # Misclassified: feed the sample back into the model.
                    try:
                        Model.partial_fit(TestFV, TestLabel)  #update the model
                        logger.info("Model Partially Fitted")
                    # NOTE(review): bare except hides every failure of partial_fit.
                    except:
                        logger.error("Partially Fitted Failed")
                        pass
                PredictedMalLabel = np.array([float(PredictedLabel)])
                # Per-sample metrics go to the metadata file.
                print >> f1, (metrics.classification_report(
                    TestMalLabel,
                    PredictedMalLabel,
                    target_names=['Sample', 'Sample']))
                print >> f1, "Zero-one classification loss:", metrics.zero_one_loss(
                    TestMalLabel, PredictedMalLabel)
                print >> f1, '-' * 100
            #Ben Sample
            if NumTestMalSamples + i < len(TestLabels):
                # Goodware samples start at offset NumTestMalSamples.
                TestLabel = TestLabels[NumTestMalSamples + i]
                TestFV = TestFVs[NumTestMalSamples + i]
                TestGoodLabel = np.array([TestLabel])
                PredictedLabel2 = Model.predict(TestFV)
                PredictedLabels.append(float(PredictedLabel2))
                NewTestLabels.append(TestLabel)
                if float(PredictedLabel2) != TestLabel:
                    try:
                        Model.partial_fit(TestFVs[NumTestMalSamples + i],
                                          TestLabel)  #update the model
                        logger.info("Model Partially Fitted")
                    except:
                        logger.error("Partially Fitted Failed")
                        pass
                PredictedGoodLabel = np.array([float(PredictedLabel2)])
                print >> f1, (metrics.classification_report(
                    TestGoodLabel,
                    PredictedGoodLabel,
                    target_names=['Sample', 'Sample']))
                print >> f1, "Zero-one classification loss:", metrics.zero_one_loss(
                    TestGoodLabel, PredictedGoodLabel)
                print >> f1, '-' * 100
            i += 1

        # Summary report for the whole test set.
        if modeldict[Model] == 'CW':
            print >> f, 'Best Eta parameter', EtaBest
        elif modeldict[Model] == 'AROW':
            print >> f, 'Best C parameter', CBest
        print >> f, '-' * 100
        print >> f, '-' * 43 + 'Whole Database' + '-' * 43
        Accuracy = metrics.accuracy_score(PredictedLabels, NewTestLabels)
        print >> f, "Test Set Accuracy = ", Accuracy
        print >> f, 'testing time', time() - T0
        print >> f, (metrics.classification_report(
            NewTestLabels, PredictedLabels,
            target_names=['Goodware', 'Malware']))
        # raw_input()

        print >> f, 'Classifier Top Features'
        print >> f, '-' * 100
        Vocab = NewLineCVetorizer.get_feature_names()
        # The two model classes appear to store their weights differently;
        # the second layout is tried when the first lookup fails.
        # NOTE(review): bare except - confirm which exceptions are expected.
        try:
            FeautureImportances = Model.model["mu"][1.0].toarray()[0][:-1]
        except:
            FeautureImportances = Model.model["mu"].toarray()[0]
        # Indices of the 100 largest weights, largest first.
        TopFeatureIndices = FeautureImportances.argsort()[-100:][::-1]
        for FIndex in TopFeatureIndices:
            print >> f, Vocab[FIndex], FeautureImportances[FIndex]

        print >> f, '-' * 100
        print >> f, 'before deleting rows TestFVs.shape', TestFVs.shape
        # Keep only the rows before the first -1 (goodware) label, i.e. the
        # malware rows.
        # NOTE(review): TestFVss stays undefined if no label equals -1.
        for i in xrange(len(TestSamples)):
            if -1 == TestLabels[i]:
                TestFVss = TestFVs[:i, :]
                break
        print >> f, 'after deleting rows TestFVs.shape', TestFVss.shape
        # Multiplying by a diagonal matrix of the weights scales every
        # feature column by its weight.
        FeatureImportancesSparseArray = ssp.lil_matrix(
            (TestFVss.shape[1], TestFVss.shape[1]))
        FeatureImportancesSparseArray.setdiag(FeautureImportances)
        AllFVsTimesW = TestFVss * FeatureImportancesSparseArray
        print >> f, '-' * 100
        # Average weighted feature vector, flattened to a 1-D array.
        AvgFV = AllFVsTimesW.mean(axis=0)
        AvgFV = AvgFV.view(dtype=np.float64).reshape(AvgFV.shape[1], -1)
        AvgFV = np.array(AvgFV).reshape(-1, )
        TopRes = AvgFV.argsort()[-100:][::-1]
        print >> f, 'Top Feats of Test Positive Vector * Feature Importance Vector'
        for Sindex in TopRes:
            print >> f, Vocab[Sindex], AvgFV[Sindex]
        print >> f, '-' * 100
def __init__(self, directory, image_data_generator,
             target_size=(256, 256),
             color_mode='rgb',
             classes=None,
             class_mode='categorical',
             batch_size=32,
             shuffle=True,
             seed=None,
             data_format='channels_last',
             save_to_dir=None,
             save_prefix='',
             save_format='png',
             follow_links=False,
             subset=None,
             interpolation='nearest',
             dtype='float32'):
    """Index the images found under ``directory``, one class per subfolder.

    When ``classes`` is empty or ``None`` the class names are the sorted
    sub-directories of ``directory``. Every class folder is scanned by
    ``_list_valid_filenames_in_directory`` on a thread pool; the results
    populate ``self.filenames`` and the int32 label array ``self.classes``.

    Raises ``ValueError`` if ``class_mode`` is not one of
    ``self.allowed_class_modes``.
    """
    parent = super(DirectoryIterator, self)
    parent.set_processing_attrs(image_data_generator,
                                target_size,
                                color_mode,
                                data_format,
                                save_to_dir,
                                save_prefix,
                                save_format,
                                subset,
                                interpolation)
    self.directory = directory
    self.classes = classes
    if class_mode not in self.allowed_class_modes:
        raise ValueError('Invalid class_mode: {}; expected one of: {}'
                         .format(class_mode, self.allowed_class_modes))
    self.class_mode = class_mode
    self.dtype = dtype

    # Step 1: work out the class names.
    self.samples = 0
    class_names = classes
    if not class_names:
        class_names = [
            entry for entry in sorted(os.listdir(directory))
            if os.path.isdir(os.path.join(directory, entry))
        ]
    self.num_classes = len(class_names)
    self.class_indices = dict(zip(class_names, range(len(class_names))))

    # Step 2: list the files of every class folder concurrently.
    pool = multiprocessing.pool.ThreadPool()
    self.filenames = []
    pending = [
        pool.apply_async(_list_valid_filenames_in_directory,
                         (os.path.join(directory, name),
                          self.white_list_formats,
                          self.split,
                          self.class_indices,
                          follow_links))
        for name in class_names
    ]

    label_chunks = []
    for handle in pending:
        labels, names = handle.get()
        label_chunks.append(labels)
        self.filenames += names

    # Step 3: concatenate the per-folder labels into a single array.
    self.samples = len(self.filenames)
    self.classes = np.zeros((self.samples,), dtype='int32')
    offset = 0
    for labels in label_chunks:
        end = offset + len(labels)
        self.classes[offset:end] = labels
        offset = end

    print('Found %d images belonging to %d classes.' %
          (self.samples, self.num_classes))
    pool.close()
    pool.join()
    parent.__init__(self.samples, batch_size, shuffle, seed)
def run_function_different_arguments_parallel(function, arguments, parallel=True,
                                              all_success=True, signal=None,
                                              use_thread=False, *args, **kwargs):
    """
    Call ``function`` once per entry of ``arguments``, optionally in parallel.

    :param function: f(argument, *args, **kwargs)
    :param arguments: {key: argument}
    :param parallel: (boolean) The code is run in parallel only if it's True.
    :param all_success: (boolean) currently unused; the first failing job
        always raises.
    :param signal: (function) called with ``1`` after all jobs have been
        submitted and joined. It's used to test KeyboardInterrupt, and the
        signal is a mock of KeyboardInterrupt.
    :param use_thread: (boolean) use a thread pool (one thread per argument)
        instead of a process pool.
    :param args: additional positional arguments of function
    :param kwargs: additional keyword arguments of function
    :return: {key: output of f(arguments[key])}, or -1 if a
        KeyboardInterrupt was received while the pool was running.
    :raises Exception: whatever exception a job raised.
    """
    if not parallel:
        return {
            key: function(argument, *args, **kwargs)
            for key, argument in arguments.items()
        }

    if not arguments:
        # Nothing to run; a pool cannot be created with zero workers.
        return {}

    if use_thread:
        pool = ThreadPool(len(arguments))
    else:
        pool = mp.Pool(processes=min(len(arguments), mp.cpu_count()))

    jobs = {}
    try:
        for key, argument in arguments.items():
            jobs[key] = pool.apply_async(
                function, args=(argument, ) + args, kwds=kwargs)
        pool.close()
        pool.join()
        if signal is not None:
            signal(1)
    except KeyboardInterrupt:
        print("Ctrl+c received, terminating and joining pool.")
        pool.terminate()
        pool.join()
        return -1

    results = {}
    for key, job in jobs.items():
        try:
            # AsyncResult.get() re-raises the job's stored exception, so
            # calling it again can never succeed: report once and propagate.
            results[key] = job.get()
        except Exception as e:
            print("job failed")
            print(arguments[key])
            print(e)
            print(args)
            print(kwargs)
            raise
    return results
def func_wrapper(*args, **kwargs):
    """Run ``item(*args, **kwargs)`` in a worker thread with a time limit.

    Returns whatever ``item`` returns, re-raises whatever it raises, and
    raises ``multiprocessing.TimeoutError`` if no result is available within
    ``max_timeout`` seconds.
    """
    pool = multiprocessing.pool.ThreadPool(processes=1)
    try:
        async_result = pool.apply_async(item, args, kwargs)
        # raises a TimeoutError if execution exceeds max_timeout
        return async_result.get(max_timeout)
    finally:
        # close() does not block: it only lets the pool's threads exit once
        # the submitted call has finished, so the pool is not leaked on
        # every invocation.
        pool.close()
def train(current_time, loaded_version):
    """Train the player on self-play games, forever.

    Games are read from the MongoDB collection ``superGo[current_time]`` into
    a ``SelfPlayDataset``. Once the dataset holds at least ``MOVES`` entries
    the function loops over it endlessly; every ``TRAIN_STEPS`` iterations a
    copy of the current player is evaluated against the best player in a
    ``MyPool(1)`` worker, and ``new_agent`` is called with the outcome.

    :param current_time: name of the Mongo collection; also passed to
        ``load_player`` / ``save_models``.
    :param loaded_version: if truthy, training resumes from that checkpoint;
        otherwise a fresh ``Player`` is created and saved.
    :return: never returns normally (``while True`` loop).
    """

    last_id = 0
    total_ite = 0
    lr = LR
    version = 1
    # False until the first evaluation tick has passed (see the loop below).
    pool = False
    criterion = AlphaLoss()
    dataset = SelfPlayDataset()

    ## Database connection
    client = MongoClient()
    collection = client.superGo[current_time]

    ## First player either from disk or fresh
    if loaded_version:
        player, checkpoint = load_player(current_time, loaded_version)
        optimizer = create_optimizer(player, lr, param=checkpoint['optimizer'])
        # Restore the training counters stored in the checkpoint.
        total_ite = checkpoint['total_ite']
        lr = checkpoint['lr']
        version = checkpoint['version']
        last_id = collection.find().count() - (MOVES // MOVE_LIMIT) * 2
        #last_id = collection.find().count() - 1
    else:
        player = Player()
        optimizer = create_optimizer(player, lr)
        state = create_state(version, lr, total_ite, optimizer)
        player.save_models(state, current_time)
    best_player = deepcopy(player)

    ## Callback after the evaluation is done, must be a closure
    def new_agent(result):
        # A truthy result promotes the pending player to best player and
        # saves it under a new version number.
        if result:
            nonlocal version, pending_player, current_time, \
                    lr, total_ite, best_player
            version += 1
            state = create_state(version, lr, total_ite, optimizer)
            best_player = pending_player
            pending_player.save_models(state, current_time)
            print("[EVALUATION] New best player saved !")
        else:
            nonlocal last_id
            ## Force a new fetch in case the player didn't improve
            last_id = fetch_new_games(collection, dataset, last_id)

    ## Wait until the circular buffer is full
    while len(dataset) < MOVES:
        last_id = fetch_new_games(collection, dataset, last_id, loaded_version=loaded_version)
        time.sleep(30)

    print("[TRAIN] Circular buffer full !")
    print("[TRAIN] Starting to train !")
    dataloader = DataLoader(dataset, collate_fn=collate_fn, \
                            batch_size=BATCH_SIZE, shuffle=True)

    while True:
        batch_loss = []
        for batch_idx, (state, move, winner) in enumerate(dataloader):
            running_loss = []
            lr, optimizer = update_lr(lr, optimizer, total_ite)

            ## Evaluate a copy of the current network
            if total_ite % TRAIN_STEPS == 0:
                # pool is still False on the first tick, so the first
                # evaluation is skipped.
                if (pool):
                    pending_player = deepcopy(player)
                    last_id = fetch_new_games(collection, dataset, last_id)

                    ## Wait in case an evaluation is still going on
                    # if pool:
                    #     print("[EVALUATION] Waiting for eval to end before re-eval")
                    #     pool.close()
                    #     pool.join()
                    pool = MyPool(1)
                    try:
                        pool.apply_async(evaluate, args=(pending_player, best_player), \
                                callback=new_agent)
                        # close() + join() right after submission make
                        # training wait here until the evaluation is done.
                        pool.close()
                        pool.join()
                    except Exception as e:
                        client.close()
                        pool.terminate()
                # From here on pool is only a "first tick passed" flag.
                pool = True

            example = {'state': state, 'winner': winner, 'move': move}
            loss = train_epoch(player, optimizer, example, criterion)
            running_loss.append(loss)

            ## Print running loss
            if total_ite % LOSS_TICK == 0:
                print("[TRAIN] current iteration: %d, averaged loss: %.3f"\
                        % (total_ite, np.mean(running_loss)))
                batch_loss.append(np.mean(running_loss))
                running_loss = []

            ## Fetch new games
            if total_ite % REFRESH_TICK == 0:
                last_id = fetch_new_games(collection, dataset, last_id)

            total_ite += 1

        # Summary after one full pass over the dataloader.
        if len(batch_loss) > 0:
            print("[TRAIN] Average backward pass loss : %.3f, current lr: %f" % (np.mean(batch_loss), lr))
def func_wrapper(*args, **kwargs):
    """Run ``item(*args, **kwargs)`` in a worker thread with a time limit.

    Returns the result of ``item``; raises ``multiprocessing.TimeoutError``
    if no result is available within ``max_timeout`` seconds.
    """
    pool = multiprocessing.pool.ThreadPool(processes=1)
    try:
        async_result = pool.apply_async(item, args, kwargs)
        return async_result.get(max_timeout)
    finally:
        # Non-blocking: lets the pool's threads exit once the call has
        # finished instead of leaking one pool per invocation.
        pool.close()
def func1(name):
    """Print the current process id with ``name``, wait 2 seconds, return ``name``."""
    message = f"当前进程的ID:{os.getpid()}, {name}"
    print(message)
    sleep(2)
    return name


def func2(args):
    """Callback: print the value returned by a finished task."""
    print(args)


if __name__ == "__main__":
    # Create a process pool with 5 worker processes
    pool = Pool(5)

    # The first three tasks report their result through the callback ...
    for task_name in ("sxt1", "sxt2", "sxt3"):
        pool.apply_async(func=func1, args=(task_name, ), callback=func2)
    # ... the remaining five run without one.
    for task_name in ("sxt4", "sxt5", "sxt6", "sxt7", "sxt8"):
        pool.apply_async(func=func1, args=(task_name, ))

    # Stop accepting new tasks
    pool.close()
    # Wait for the worker processes to finish
    pool.join()

x = "The are %d types of people." % 10
def parallel_get(self, urls: List[str]) -> List[Response]:
    """GET multiple URLs in parallel.

    Duplicate URLs are fetched only once. The URLs are scheduled with the
    configured per-domain timeout, distributed over at most
    ``parallel_get_num_parallel()`` blocks, and each block is fetched by
    ``_parallel_get_web_store`` on its own worker thread.

    :param urls: URLs to fetch; ``None`` or an empty list yields ``[]``.
    :return: one response per unique URL, in the order of first appearance
        in ``urls``.
    :raises McParallelGetException: if a URL is not a valid HTTP(S) URL or
        if a response is missing for one of the URLs.
    """

    # FIXME doesn't respect timing() and other object properties

    urls = decode_object_from_bytes_if_needed(urls)

    # Original implementation didn't raise on undefined / empty list of URLs
    if urls is None or len(urls) == 0:
        return []

    # Remove duplicates from list while maintaining order because:
    # 1) We don't want to fetch the same URL twice
    # 2) URLs are being used as unique dictionary IDs later on
    urls_before_removing_duplicates = urls.copy()
    urls = list(OrderedDict.fromkeys(urls))
    if len(urls) != len(urls_before_removing_duplicates):
        log.warning("Some of the URLs are duplicate; URLs: %s" % str(urls_before_removing_duplicates))

    # Raise on one or more invalid URLs because we consider it a caller's problem; if URL at least looks valid,
    # get() in a fork should be able to come up with a reasonable Response object for it
    for url in urls:
        if not is_http_url(url):
            raise McParallelGetException("URL %s is not a valid URL; URLs: %s" % (url, str(urls),))

    num_parallel = self._user_agent_config.parallel_get_num_parallel()
    timeout = self._user_agent_config.parallel_get_timeout()
    per_domain_timeout = self._user_agent_config.parallel_get_per_domain_timeout()

    url_stack = UserAgent.__get_scheduled_urls(urls_=urls, per_domain_timeout_=per_domain_timeout)

    start_time = time.time()

    # Deal the scheduled URLs round-robin into at most num_parallel blocks.
    url_blocks = {}
    while len(url_stack) > 0:
        block_i = len(url_stack) % num_parallel
        url_blocks.setdefault(block_i, []).append(url_stack.pop())

    # Using ThreadPool instead of Pool because this sometimes gets called from a Celery worker, and if it does,
    # creating a process pool might fail with:
    #
    #     AssertionError: daemonic processes are not allowed to have children
    #
    pool = multiprocessing.pool.ThreadPool(processes=num_parallel)
    try:
        all_results = [
            pool.apply_async(_parallel_get_web_store, args=(url_block, start_time, timeout,))
            for url_block in url_blocks.values()
        ]

        # No timeouts here because we trust the workers to timeout by themselves (by UserAgent)
        all_responses = []
        for result in all_results:
            all_responses.extend(result.get())

        pool.close()
        pool.join()
    finally:
        # Release the pool even when a worker raised and get() propagated it.
        pool.terminate()

    # Sort URLs in parameter order
    # (if URLs weren't split into blocks, we could probably use map_async)
    response_url_map = {}
    for response in all_responses:
        url = response.scheduled_url.url
        response_url_map[url] = response.response

    sorted_responses = []
    for url in urls:
        if url not in response_url_map:
            raise McParallelGetException(
                "URL %s is not in the response URL map %s." % (url, response_url_map,)
            )

        sorted_responses.append(response_url_map[url])

    if len(urls) != len(sorted_responses):
        raise McParallelGetException(
            "Response count doesn't match URL count; responses: %s; URLs: %s" % (sorted_responses, urls,)
        )

    return sorted_responses
def __init__(self, directory, image_data_generator=None,
             target_size=(256, 256),
             color_mode='rgb',
             classes=None,
             class_mode='categorical',
             batch_size=32,
             shuffle=True,
             seed=None,
             data_format=None,
             save_to_dir=None,
             save_prefix='',
             save_format='png',
             followlinks=False,
             image_ext_list=config.IMAGE_EXTENSIONS):
    """Index the image files found under ``directory``, one class per subfolder.

    When ``classes`` is empty or ``None`` the class names are the sorted
    sub-directories of ``directory``. Files are first counted, then listed,
    on a thread pool; the listing fills ``self.filepaths`` (a numpy array)
    and the int32 label array ``self.classes``.

    Raises ``ValueError`` for an unsupported ``color_mode`` or ``class_mode``.
    """
    if data_format is None:
        data_format = K.image_data_format()

    self.directory = directory
    self.image_data_generator = image_data_generator
    self.target_size = tuple(target_size)

    if color_mode not in {'rgb', 'grayscale'}:
        raise ValueError('Invalid color mode:', color_mode,
                         '; expected "rgb" or "grayscale".')
    self.color_mode = color_mode
    self.data_format = data_format

    if class_mode not in {'categorical', 'binary', 'sparse', 'input', None}:
        raise ValueError('Invalid class_mode:', class_mode,
                         '; expected one of "categorical", '
                         '"binary", "sparse", "input"'
                         ' or None.')
    self.class_mode = class_mode
    self.save_to_dir = save_to_dir
    self.save_prefix = save_prefix
    self.save_format = save_format

    # Pass 1: determine the classes and count the samples.
    self.samples = 0
    class_names = classes
    if not class_names:
        class_names = [
            entry for entry in sorted(os.listdir(directory))
            if os.path.isdir(os.path.join(directory, entry))
        ]
    self.num_class = len(class_names)
    self.class_indices = dict(zip(class_names, range(len(class_names))))

    pool = multiprocessing.pool.ThreadPool()
    count_files = partial(_count_valid_files_in_directory,
                          white_list_formats=image_ext_list,
                          follow_links=followlinks)
    per_class_counts = pool.map(
        count_files,
        (os.path.join(directory, name) for name in class_names))
    self.samples = sum(per_class_counts)

    print('Found {0} images belonging to {1} classes.'.format(
        self.samples, self.num_class))

    # Pass 2: list the files of every class folder concurrently.
    self.filepaths = []
    self.classes = np.zeros((self.samples,), dtype='int32')
    pending = [
        pool.apply_async(list_valid_filepaths_in_directory,
                         (os.path.join(directory, name),
                          image_ext_list,
                          self.class_indices,
                          followlinks))
        for name in class_names
    ]

    offset = 0
    for handle in pending:
        labels, paths = handle.get()
        end = offset + len(labels)
        self.classes[offset:end] = labels
        self.filepaths += paths
        offset = end

    self.filepaths = np.array(self.filepaths)
    pool.close()
    pool.join()
    super(DirIterator, self).__init__(self.samples, batch_size, shuffle, seed)
def minimize(p0, pop_size=2, generations=2, processes=4):
    """Minimise fitness with a small genetic algorithm seeded from ``p0``.

    The initial population holds one ``Individual(p0, clone=True)`` plus
    ``pop_size`` individuals created with ``Individual(p0)``. Each generation
    adds offspring (a fresh member crossed with the current best, random
    crossovers and mutants), evaluates them with ``evaluate`` on a worker
    pool, and keeps the ``pop_size`` individuals with the lowest fitness.

    :param p0: starting parameter vector handed to ``Individual``.
    :param pop_size: number of individuals kept after each generation.
    :param generations: number of generations to run.
    :param processes: number of worker processes used for evaluation.
    :return: list of individuals; sorted by ascending fitness and of length
        ``pop_size`` when at least one generation ran, otherwise the
        evaluated initial population (``pop_size + 1`` members, unsorted).
    """
    crossover_probability = 0.02
    mutation_probability = 0.1
    new_probability = 0.75

    def score(individuals):
        # Evaluate all individuals in a worker pool and store the results
        # (fitness and scalar) on each of them.
        pool = MyPool(processes=processes)
        jobs = [
            pool.apply_async(evaluate, args=(individual.p, individual.ssn))
            for individual in individuals
        ]
        pool.close()
        pool.join()
        for job, individual in zip(jobs, individuals):
            fitness, scalar = job.get()
            individual.setFitness(fitness)
            individual.setScalar(scalar)

    pop = [Individual(p0, clone=True)]
    for _ in range(pop_size):
        pop.append(Individual(p0))

    # Evaluate the entire population
    score(pop)

    for _ in range(generations):
        offspring = []

        # create a new population member and mix it with the best
        if np.random.rand() < new_probability:
            new_member = Individual(pop[0].p)
            offspring.append(new_member)
            child1, child2, changed = new_member.crossover(pop[0])
            if changed:
                offspring.append(child1)
                offspring.append(child2)

        # iterate over each individual in the population
        for i, individual in enumerate(pop):
            # A lone individual has no distinct partner; without the size
            # check the partner search below would never terminate.
            if np.random.rand() < crossover_probability and len(pop) > 1:
                # Crossover with a random, non-identical partner
                partner = np.random.randint(len(pop))
                while partner == i:
                    partner = np.random.randint(len(pop))
                child1, child2, changed = individual.crossover(pop[partner])
                if changed:
                    offspring.append(child1)
                    offspring.append(child2)

            if np.random.rand() < mutation_probability:
                # Create a mutant
                mutant = individual.clone_self()
                if mutant.mutate():
                    offspring.append(mutant)

        # Evaluate the offspring
        if offspring:
            score(offspring)

        # Lowest fitness first; ties keep their relative order.
        new_pop = sorted(pop + offspring, key=lambda ind: ind.fitness)

        # NOTE: a neighbour de-duplication step ("Ensure Genetic Diversity")
        # was present here only as disabled code; it is intentionally not
        # applied.

        pop = new_pop[:pop_size]

    return pop