def _map_parallel(function, args, n_jobs):
    """multiprocessing.Pool(processes=n_jobs).map with some error checking"""
    # Following the error checking found in joblib
    multiprocessing = int(os.environ.get('JOBLIB_MULTIPROCESSING', 1)) or None
    if multiprocessing:
        try:
            import multiprocessing
            import multiprocessing.pool
        except ImportError:
            multiprocessing = None
    # 2nd stage: validate that locking is available on the system and
    # issue a warning if not
    if multiprocessing:
        try:
            _sem = multiprocessing.Semaphore()
            del _sem  # cleanup
        except (ImportError, OSError) as e:
            multiprocessing = None
            warnings.warn('%s. _map_parallel will operate in serial mode' % (e,))
    if multiprocessing and int(n_jobs) not in (0, 1):
        if n_jobs == -1:
            n_jobs = None
        pool = multiprocessing.Pool(processes=n_jobs)
        map_result = pool.map(function, args)
        pool.close()
        pool.join()
    else:
        map_result = list(map(function, args))
    return map_result
def run_trajectory(t, ps, landscape, ptv, num_iterations, num_processors):
    # Get the points in the trajectory
    points = t.points()

    # Determine the index of each unique point (sometimes points are equal due to rounding)
    uinds = [i for i, p in enumerate(points) if i == 0 or not p.equals(points[i - 1])]

    # Create a process pool, using as many processors as are available, or
    # are required to allow each point to run concurrently
    pool = mp.Pool(processes=min(num_processors, len(points)))

    results = []
    for i in uinds:
        # Modify the parameter set to match the current point
        psm = ps.copy()
        psm.modify_for_point(points[i], ptv)
        psm.convert_to_age_classes()

        # Launch a process to run the simulation(s) for the point. This modifies the point in place
        args = [points[i], psm, landscape, num_iterations, num_processors]
        results.append(pool.apply_async(run_iterations_for_point, args))

    pool.close()
    pool.join()

    # Merge the unique and non-unique points back together
    for i, r in zip(uinds, results):
        points[i] = r.get(None)

    # Return a new trajectory containing the results for each point
    return io.Trajectory(points=points)
def ScopedPool(*args, **kwargs):
    """Context Manager which returns a multiprocessing.pool instance which
    correctly deals with thrown exceptions.

    *args - Arguments to multiprocessing.pool

    Kwargs:
      kind ('threads', 'procs') - The type of underlying coprocess to use.
      **etc - Arguments to multiprocessing.pool
    """
    if kwargs.pop('kind', None) == 'threads':
        pool = multiprocessing.pool.ThreadPool(*args, **kwargs)
    else:
        orig, orig_args = kwargs.get('initializer'), kwargs.get('initargs', ())
        kwargs['initializer'] = _ScopedPool_initer
        kwargs['initargs'] = orig, orig_args
        pool = multiprocessing.pool.Pool(*args, **kwargs)

    try:
        yield pool
        pool.close()
    except:
        pool.terminate()
        raise
    finally:
        pool.join()
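# ScopedPool above uses `yield`, so in its original module it is presumably wrapped with
# contextlib.contextmanager before being used as a context manager. A minimal usage
# sketch under that assumption (the worker function `_square` is illustrative, not taken
# from the original source):
def _square(x):
    return x * x


def _scoped_pool_demo():
    # On normal exit the pool is closed and joined; if the body raises, the pool is
    # terminated before the exception propagates.
    with ScopedPool(4, kind='threads') as pool:
        return pool.map(_square, range(8))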
def test_no_thread_pool():
    pool = xmon_stepper.ThreadlessPool()
    result = pool.map(lambda x: x + 1, range(10))
    assert result == [x + 1 for x in range(10)]
    # No ops.
    pool.terminate()
    pool.join()
def refine(L, X, D, e, a, b, k, num_workers, metric):
    """
    Throw out bad points (algorithm 7, lines 7-17)
    :param L: List of subsets
    :param X: Data matrix
    :param D: dictionary
    :param e: lower bound on fractional size of each cluster
    :param a: lower bound on fractional size of a set inside own cluster for which stability holds
    :param b: lower bound on fractional size of a set outside own cluster for which stability holds
    :param k: Number of clusters
    :param num_workers: Number of workers
    :param metric: metric is in {avg, max, min}
    :return: Refined clusters
    """
    print("Getting rid of bad points")
    print("Length of L at start = ", len(L))
    start = time.time()
    n = len(X)
    T = int((e - 2 * a - b * k) * n)
    t = int((e - a) * n)
    with Pool() as pool:
        func = partial(refine_individual, D, T, t)
        L = pool.map(func, L)
        pool.close()
        pool.join()
    end = time.time()
    print("Length of L on end = ", len(L))
    print("time = {0:.2f}s".format(end - start))
    return grow(L, X, a, num_workers, metric)
def from_carrays(path, format_categories='bcolz', format_codes='bcolz', format_values='bcolz', parallel=True):
    assert os.path.exists(path), 'No path {}'.format(path)
    df_columns = glob.glob(os.path.join(path, '*'))
    df = dict()
    if parallel:
        pool = multiprocessing.pool.ThreadPool()
        results = []
        for i, k in enumerate(df_columns):
            p = pool.apply_async(_from_carray, args=(k,),
                                 kwds={'format_categories': format_categories,
                                       'format_codes': format_codes,
                                       'format_values': format_values})
            results.append(p)
        pool.close()
        pool.join()
        for x in results:
            meta, s = x.get()
            df[meta['name']] = s
    else:
        for i, k in enumerate(df_columns):
            meta, s = _from_carray(k, format_categories=format_categories,
                                   format_codes=format_codes, format_values=format_values)
            df[meta['name']] = s

    # this is slow when we have non categoricals as series for some reason
    with log.timedlogger('constructing dataframe from %s column dict' % len(df)):
        df = pandas.DataFrame(df)  # TODO: fast DataFrame constructor
    return df
def parallel_compile(self, sources, output_dir=None, macros=None, include_dirs=None,
                     debug=0, extra_preargs=None, extra_postargs=None, depends=None):
    """New compile function that we monkey patch into the existing compiler instance.
    """
    import multiprocessing.pool

    # Copied from the regular compile function
    macros, objects, extra_postargs, pp_opts, build = \
        self._setup_compile(output_dir, macros, include_dirs, sources, depends, extra_postargs)
    cc_args = self._get_cc_args(pp_opts, debug, extra_preargs)

    def _single_compile(obj):
        try:
            src, ext = build[obj]
        except KeyError:
            return
        self._compile(obj, src, ext, cc_args, extra_postargs, pp_opts)

    # Set by fix_compiler
    global glob_use_njobs
    if glob_use_njobs == 1:
        # This is equivalent to regular compile function
        for obj in objects:
            _single_compile(obj)
    else:
        # Use ThreadPool, rather than Pool, since the objects are not picklable.
        pool = multiprocessing.pool.ThreadPool(glob_use_njobs)
        pool.map(_single_compile, objects)
        pool.close()
        pool.join()

    # Return *all* object filenames, not just the ones we just built.
    return objects
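# The `glob_use_njobs` global above is described as "Set by fix_compiler", which is not
# part of this excerpt. The following is a hedged sketch of how such a helper could patch
# parallel_compile onto a distutils/setuptools compiler instance in the same module
# (names other than parallel_compile and glob_use_njobs are illustrative assumptions):
import types


def _patch_compiler_for_parallel_build(compiler, njobs):
    global glob_use_njobs
    glob_use_njobs = njobs
    # Bind parallel_compile as the instance's compile method so distutils calls it.
    compiler.compile = types.MethodType(parallel_compile, compiler)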
def pass_data_to_search(symbol, path, start_time_seconds, end_time_seconds, date,
                        time_interval, tt, code_path):
    jobs = []
    dic_files = {}
    lis = []
    slot_results = []

    file_name = path + 'b' + date + '.l.bz2'
    # file_name = path + date + '/' + dic_files[lis[index]] + '.bz2'
    size = os.path.getsize(file_name)
    total_rows = size // 69  # each record is 69 bytes
    total_processes1 = 40
    slots = total_rows // total_processes1

    # Multiprocess each file as a chunk
    # mapper(0, slots, total_processes1, symbol, start_time_seconds, end_time_seconds, date, time_interval, file_name, tt, code_path)
    # mapper(1, slots, total_processes1, symbol, start_time_seconds, end_time_seconds, date, time_interval, file_name, tt, code_path)
    pool = multiprocessing.Pool(total_processes1)
    for i in range(total_processes1):
        pool.apply_async(mapper, args=(i, slots, total_processes1, symbol,
                                       start_time_seconds, end_time_seconds, date,
                                       time_interval, file_name, tt, code_path))
    pool.close()
    pool.join()
def test():
    print("Creating 5 (non-daemon) workers and jobs in main process.")
    pool = MyPool(5)
    result = pool.map(work, [randint(1, 5) for x in range(5)])
    pool.close()
    pool.join()
    print(result)
def _map_parallel(function, args, n_jobs):
    """multiprocessing.Pool(processes=n_jobs).map with some error checking"""
    # Following the error checking found in joblib
    multiprocessing = int(os.environ.get('JOBLIB_MULTIPROCESSING', 1)) or None
    if multiprocessing:
        try:
            import multiprocessing
            import multiprocessing.pool
        except ImportError:
            multiprocessing = None
    if sys.platform.startswith("win") and PY2:
        msg = "Multiprocessing is not supported on Windows with Python 2.X. Setting n_jobs=1"
        logger.warning(msg)
        n_jobs = 1
    # 2nd stage: validate that locking is available on the system and
    # issue a warning if not
    if multiprocessing:
        try:
            _sem = multiprocessing.Semaphore()
            del _sem  # cleanup
        except (ImportError, OSError) as e:
            multiprocessing = None
            logger.warning('{}. _map_parallel will operate in serial mode'.format(e))
    if multiprocessing and int(n_jobs) not in (0, 1):
        if n_jobs == -1:
            n_jobs = None
        # Create the pool before entering the try block so that a failure in Pool()
        # itself does not leave `pool` unbound in the finally clause.
        pool = multiprocessing.Pool(processes=n_jobs)
        try:
            map_result = pool.map(function, args)
        finally:
            pool.close()
            pool.join()
    else:
        map_result = list(map(function, args))
    return map_result
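# Illustrative usage sketch for _map_parallel (assumes the module-level `os`, `sys`,
# `logger`, and `PY2` names the excerpt relies on). With n_jobs other than 0/1 the work
# goes through multiprocessing.Pool, so the mapped function must be picklable, i.e. a
# module-level or built-in function rather than a lambda; on spawn-based platforms the
# call should also sit behind an `if __name__ == '__main__':` guard.
import math


def _map_parallel_demo():
    return _map_parallel(math.sqrt, range(16), n_jobs=2)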
def main():
    if len(sys.argv) < 3:
        print("Syntax:")
        print(" {} [min_yeses] [out_csv_file]".format(sys.argv[0]))
        sys.exit(1)

    min_yeses = eval(sys.argv[1])
    out_csv_file = sys.argv[2]

    pconfig = config.PaperworkConfig()
    pconfig.read()

    src_dir = pconfig.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir)
    src_dsearch.reload_index()

    nb_threads = multiprocessing.cpu_count()
    pool = multiprocessing.pool.ThreadPool(processes=nb_threads)

    with open(out_csv_file, 'a', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        for min_yes in min_yeses:
            pool.apply_async(_run_simulation, (src_dsearch, min_yes, csvwriter,))
        pool.close()
        pool.join()

    print("All done !")
def _listArtifacts(self, urls, gavs):
    """
    Loads maven artifacts from a list of GAVs and tries to locate the artifacts in one of the
    specified repositories.

    :param urls: repository URLs where the given GAVs can be located
    :param gavs: List of GAVs
    :returns: Dictionary where the key is a MavenArtifact object and the value is its repo root URL.
    """
    def findArtifact(gav, urls, artifacts):
        artifact = MavenArtifact.createFromGAV(gav)
        for url in urls:
            if maven_repo_util.gavExists(url, artifact):
                # Critical section?
                artifacts[artifact] = ArtifactSpec(url, [ArtifactType(artifact.artifactType, True, set(['']))])
                return
        logging.warning('Artifact %s not found in any url!', artifact)

    artifacts = {}
    pool = ThreadPool(maven_repo_util.MAX_THREADS)
    for gav in gavs:
        pool.apply_async(findArtifact, [gav, urls, artifacts])

    # Close the pool and wait for the workers to finish
    pool.close()
    pool.join()

    return artifacts
def slippy_test(test_options, width=TILE_WIDTH, height=TILE_HEIGHT, tile_factor=TILE_FACTOR):
    # Assume each screen is a 10x5 grid of tiles; this is approximately the OTM
    # map size at full screen at my desk.
    z = test_options['z']
    x = test_options['x']
    y = test_options['y']
    url_prefix = test_options['url_prefix']

    tiles_to_request = []
    # Integer division keeps the range bounds integral on both Python 2 and 3.
    for x_iter in range(x - width // 2, x + width // 2 - 1):
        for y_iter in range(y - height // 2, y + height // 2 - 1):
            tiles_to_request.append(url_prefix + '%d/%d/%d.png' % (z, x_iter, y_iter))

    pool = multiprocessing.Pool(processes=tile_factor)
    start_time = time.time()
    results = pool.map(slippy_test_helper, tiles_to_request)
    end_time = time.time()
    pool.close()
    pool.join()

    sys.stderr.write('.')
    if False in results:
        return '%d,ERROR,%f' % (-1, float('nan'))
    return '%d,OK,' % z + str(end_time - start_time)
def threshold(X, e, a, b, k, num_workers, metric):
    """
    Get all threshold clusters (algorithm 7, lines 1-6)
    :param X: Data matrix
    :param e: lower bound on fractional size of each cluster
    :param a: lower bound on fractional size of a set inside own cluster for which stability holds
    :param b: lower bound on fractional size of a set outside own cluster for which stability holds
    :param k: Number of clusters
    :param num_workers: Number of workers
    :param metric: metric is in the set {avg, min, max}
    :return: Threshold clusters
    """
    print("Populating list with all threshold clusters with metric:", metric)
    start = time.time()
    n = len(X)
    minsize = int(e * n)
    with Pool(num_workers) as pool:
        func = partial(get_thresholds, X, minsize, num_workers, metric)
        items = pool.map(func, range(n))
        pool.close()
        pool.join()
    threshold_lists = [item[0] for item in items]
    L = [item for sublist in threshold_lists for item in sublist]
    D = dict([(item[1], item[2]) for item in items])
    end = time.time()
    print("Length of L = ", len(L))
    print("time = {0:.2f}s".format(end - start))
    return refine(L, X, D, e, a, b, k, num_workers, metric)
def count_intersect(self, threshold, frequency=True): self.counts = OrderedDict() self.rlen, self.qlen = {}, {} self.nalist = [] if frequency: self.frequency = OrderedDict() # if self.mode_count == "bp": # print2(self.parameter, "\n{0}\t{1}\t{2}\t{3}\t{4}".format("Reference","Length(bp)", "Query", "Length(bp)", "Length of Intersection(bp)")) # elif self.mode_count == "count": # print2(self.parameter, "\n{0}\t{1}\t{2}\t{3}\t{4}".format("Reference","sequence_number", "Query", "sequence_number", "Number of Intersection")) for ty in self.groupedreference.keys(): self.counts[ty] = OrderedDict() self.rlen[ty], self.qlen[ty] = OrderedDict(), OrderedDict() if frequency: self.frequency[ty] = OrderedDict() for r in self.groupedreference[ty]: if r.total_coverage() == 0 and len(r) > 0: self.nalist.append(r.name) continue else: self.counts[ty][r.name] = OrderedDict() if self.mode_count == "bp": rlen = r.total_coverage() elif self.mode_count == "count": rlen = len(r) self.rlen[ty][r.name] = rlen mp_input = [] for q in self.groupedquery[ty]: if r.name == q.name: continue else: mp_input.append([q, self.nalist, self.mode_count, self.qlen, threshold, self.counts, frequency, self.frequency, ty, r]) # q, nalist, mode_count, qlen_dict, threshold, counts, frequency, self_frequency, ty, r pool = multiprocessing.Pool(processes=multiprocessing.cpu_count() - 1) mp_output = pool.map(mp_count_intersect, mp_input) pool.close() pool.join() # qname, nalist, qlen_dict[ty][q.name], counts[ty][r.name][q.name], self_frequency[ty][q.name].append(c[2]) for output in mp_output: if output[1]: self.nalist.append(output[1]) else: self.qlen[ty][output[0]] = output[2] self.counts[ty][r.name][output[0]] = output[3] # print(r.name) # print(output[0]) # print(output[3]) try: self.frequency[ty][output[0]][r.name] = output[3][2] except: self.frequency[ty][output[0]] = {} self.frequency[ty][output[0]][r.name] = output[3][2]
def _CompileDeps(aapt_path, dep_subdirs, temp_dir):
    partials_dir = os.path.join(temp_dir, 'partials')
    build_utils.MakeDirectory(partials_dir)
    partial_compile_command = [
        aapt_path + '2',
        'compile',
        # TODO(wnwen): Turn this on once aapt2 forces 9-patch to be crunched.
        # '--no-crunch',
    ]
    pool = multiprocessing.pool.ThreadPool(10)

    def compile_partial(directory):
        dirname = os.path.basename(directory)
        partial_path = os.path.join(partials_dir, dirname + '.zip')
        compile_command = (partial_compile_command +
                           ['--dir', directory, '-o', partial_path])
        build_utils.CheckOutput(compile_command)

        # Sorting the files in the partial ensures deterministic output from the
        # aapt2 link step which uses order of files in the partial.
        sorted_partial_path = os.path.join(partials_dir, dirname + '.sorted.zip')
        _SortZip(partial_path, sorted_partial_path)

        return sorted_partial_path

    partials = pool.map(compile_partial, dep_subdirs)
    pool.close()
    pool.join()
    return partials
def process_iteration(Ns, ps, landscape, config):
    output_dir = config.output_dir + config.ext

    if config.background_image != None:
        background_path = config.input_dir + "/" + config.background_image
    else:
        background_path = None

    # Create a point to hold the iteration
    p = Point()
    p.add_iteration()

    #draw_population(Ns[0], landscape, ps.totalK, 0, output_dir, 2.0, background_path)

    # The drawing pool only exists when display is enabled, so every use of it
    # below is guarded by config.display.
    if config.display:
        pool = mp.Pool(config.num_processors)

    for t in xrange(min(ps.max_time_steps, len(Ns))):
        if config.display:
            pool.apply_async(draw_population, [Ns[t], landscape, ps.totalK, t, output_dir, 2.0, background_path])
        p.add_time_step([t] + population_statistics(ps, landscape, Ns[t]))

    if config.display:
        pool.close()

    # Write the iteration results to file as a trajectory containing a single point
    write_trajectories([Trajectory(points=[p])], None, ps.sentinels, output_dir + "/results.txt")

    if config.save_time_steps:
        np.savez(output_dir + "/populations.npz", *Ns)

    if config.display:
        pool.join()
def run_abstraction_parallel(self):
    # initialization
    self.__get_methods()
    self.__read_config()
    self.__get_dataset()

    # get filename and properties
    filename_properties = []
    for filename, properties in self.files.iteritems():
        filename_properties.append((filename, properties))

    # run experiment in multiprocessing mode
    total_cpu = multiprocessing.cpu_count()
    pool = NoDaemonProcessPool(processes=total_cpu)
    results = pool.map(self, filename_properties)
    pool.close()
    pool.join()

    # open evaluation file
    self.__check_path(self.files['evaluation_directory'])
    f = open(self.files['evaluation_file'], 'wt')
    writer = csv.writer(f)

    # set header for evaluation file
    header = []
    if self.configuration['main']['abstraction'] == '1':
        header = self.configuration['abstraction_evaluation']['evaluation_file_header'].split('\n')
    writer.writerow(tuple(header))

    # write experiment result
    for result in results:
        writer.writerow(result)

    # close evaluation file
    f.close()
def download_junit(db, threads, client_class):
    """Download junit results for builds without them."""
    builds_to_grab = db.get_builds_missing_junit()
    pool = None
    if threads > 1:
        pool = multiprocessing.pool.ThreadPool(
            threads, mp_init_worker, ('', {}, client_class, False))
        test_iterator = pool.imap_unordered(
            get_junits, builds_to_grab)
    else:
        global WORKER_CLIENT  # pylint: disable=global-statement
        WORKER_CLIENT = client_class('', {})
        test_iterator = (
            get_junits(build_path) for build_path in builds_to_grab)
    for n, (build_id, build_path, junits) in enumerate(test_iterator, 1):
        print('%d/%d' % (n, len(builds_to_grab)),
              build_path, len(junits), len(''.join(junits.values())))
        junits = {k: remove_system_out(v) for k, v in junits.iteritems()}
        db.insert_build_junits(build_id, junits)
        if n % 100 == 0:
            db.commit()
    db.commit()
    if pool:
        pool.close()
        pool.join()
def wrapper(*args, **kwargs):
    try:
        return f(*args, **kwargs)
    except Exception as e:
        import traceback
        print(traceback.format_exc())
        pool.join()
        sys.exit()
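# The wrapper above closes over `f` and `pool` from an enclosing scope that is not part
# of the excerpt. A hypothetical enclosing decorator factory that would supply them could
# look like the sketch below (all names here are illustrative assumptions; note that a
# pool must be closed or terminated before join(), which the sketch does explicitly):
import sys
import traceback


def _exit_pool_on_error(pool):
    def decorate(f):
        def wrapper(*args, **kwargs):
            try:
                return f(*args, **kwargs)
            except Exception:
                print(traceback.format_exc())
                pool.terminate()
                pool.join()
                sys.exit(1)
        return wrapper
    return decorate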
def create_process_pool(index):
    print index
    li = range(3)
    pool = multiprocessing.Pool(processes=len(li))
    for sub_index in li:
        pool.apply_async(print_process_index, (index, sub_index))
    pool.close()
    pool.join()
def work(num_procs):
    print("Creating %i (daemon) workers and jobs in child." % num_procs)
    pool = multiprocessing.Pool(num_procs)
    result = pool.map(sleepwhile, [randint(1, 5) for x in range(num_procs)])
    pool.close()
    pool.join()
    return result
def laminar(L, X, e, a, b, num_workers, metric): """ Make family laminar (Algorithm 9) :param L: List of subsets :param X: The data set :param e: lower bound on the fractional size of every cluster :param a: lower bound on the fractional size of every set in own cluster for which stability holds :param b: lower bound on the fractional size of every set in outside cluster for which stability holds :param num_workers: number of workers :param metric: metric is in {avg, max, min} :return: Laminar list """ print("Making the list laminar (parallel)") start = time.time() n = len(X) print("Computing pairs of non-laminar sets") with Pool(num_workers) as pool: func = partial(non_laminar, L) intersections = pool.map(func, range(len(L) - 1)) pool.close() pool.join() intersections = [item for sub_list in intersections for item in sub_list] end = time.time() fname = "intersections_" + metric + ".pkl.gz" # with gzip.open(fname, 'wb') as f: # pickle.dump(intersections, f) print("Length of intersections = ", len(intersections)) print("time = {0:0.2f}s".format(end - start)) print("Removing non-laminar pairs") start = time.time() manager = Manager() shared_L = manager.list(L) n = len(intersections) j = 0 batch = int(n / num_workers) rem = n % num_workers jobs = [] for i in range(num_workers): process = Process( target=iterate_laminar, args=(shared_L, X, e, a, b, num_workers, metric, intersections[j : j + batch]) ) process.start() jobs.append(process) j += batch if rem: process = Process( target=iterate_laminar, args=(shared_L, X, e, a, b, num_workers, metric, intersections[j : j + rem]) ) process.start() jobs.append(process) for p in jobs: p.join() L = [item for item in shared_L if item is not None] end = time.time() print("Length of list after removing non-laminar pairs = ", len(L)) print("time = {0:.2f}s".format(end - start)) return L
def update_all(opts):
    """Updates all menus"""
    pool = NoDaemonPool(processes=5)
    pool.apply_async(update_applications, (opts,))
    pool.apply_async(update_bookmarks, (opts,))
    pool.apply_async(update_recent_files, (opts,))
    pool.apply_async(update_devices, (opts,))
    pool.apply_async(update_rootmenu, (opts,))
    pool.close()
    pool.join()
def test():
    print("Creating 5 (non-daemon) workers and jobs in main process.")
    year = [x for x in range(2008, 2014)]
    pool = CustomPool(len(year) * 4)
    result = pool.map(work, year)
    pool.close()
    pool.join()
def work(num_procs):
    print "Creating %i (daemon) workers and jobs in child." % num_procs
    pool = multiprocessing.Pool(num_procs)
    result = pool.map(sleepawhile, [randint(1, 5) for x in range(num_procs)])

    # The following is not really needed, since the (daemon) workers of the
    # child's pool are killed when the child is terminated, but it's good
    # practice to cleanup after ourselves anyway.
    pool.close()
    pool.join()
    return result
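# Several snippets in this collection (MyPool, CustomPool, NoDaemonPool, NonDaemonPool,
# NoDaemonProcessPool) rely on a Pool subclass whose workers are non-daemonic, so that a
# worker may itself create a nested Pool. The exact definitions are not part of these
# excerpts; this is a minimal sketch of the commonly used Python 3 pattern (class names
# here are illustrative assumptions):
import multiprocessing
import multiprocessing.pool


class _NoDaemonProcess(multiprocessing.Process):
    # Always report daemon=False and ignore attempts to set it, so Pool cannot
    # mark the worker as daemonic.
    @property
    def daemon(self):
        return False

    @daemon.setter
    def daemon(self, value):
        pass


class _NoDaemonContext(type(multiprocessing.get_context())):
    Process = _NoDaemonProcess


class _NestablePool(multiprocessing.pool.Pool):
    # Subclass multiprocessing.pool.Pool (a real class) rather than multiprocessing.Pool
    # (a factory function) and hand it the non-daemonic context.
    def __init__(self, *args, **kwargs):
        kwargs['context'] = _NoDaemonContext()
        super().__init__(*args, **kwargs)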
def _ConvertToWebP(webp_binary, png_files):
    pool = multiprocessing.pool.ThreadPool(10)

    def convert_image(png_path):
        root = os.path.splitext(png_path)[0]
        webp_path = root + '.webp'
        args = [webp_binary, png_path] + _PNG_TO_WEBP_ARGS + [webp_path]
        subprocess.check_call(args)
        os.remove(png_path)

    # Android requires pngs for 9-patch images.
    pool.map(convert_image, [f for f in png_files if not f.endswith('.9.png')])
    pool.close()
    pool.join()
def parse_sam_in_threads(remap_csv, nthreads):
    """ Call parse_sam() in multiple processes.

    Launch a multiprocessing pool, walk through the iterator, and then be sure
    to close the pool at the end.
    """
    pool = Pool(processes=nthreads)
    try:
        reads = pool.imap(parse_sam,
                          iterable=matchmaker(remap_csv),
                          chunksize=100)
        for read in reads:
            yield read
    finally:
        pool.close()
        pool.join()
def _ConvertToWebP(webp_binary, png_files):
    pool = multiprocessing.pool.ThreadPool(10)

    def convert_image(png_path):
        root = os.path.splitext(png_path)[0]
        webp_path = root + '.webp'
        args = [webp_binary, png_path, '-mt', '-quiet', '-m', '6', '-q', '100',
                '-lossless', '-o', webp_path]
        subprocess.check_call(args)
        os.remove(png_path)

    pool.map(convert_image,
             [f for f in png_files if not _PNG_WEBP_BLACKLIST_PATTERN.match(f)])
    pool.close()
    pool.join()
def buildList(self): """ Build the artifact "list" from sources defined in the given configuration. :returns: Dictionary described above. """ priority = 0 pool_dict = {} for source in self.configuration.artifactSources: priority += 1 pool = pool_dict.setdefault(source['type'], ThreadPool(self.MAX_THREADS_DICT[source['type']])) pool.apply_async(self._read_artifact_source, args=[source, priority], callback=self._add_result) for pool in pool_dict.values(): pool.close() at_least_1_runs = True all_keys = range(1, len(self.configuration.artifactSources) + 1) finished = False while at_least_1_runs: for i in range(30): time.sleep(1) if not self.errors.empty(): for pool in pool_dict.values(): logging.debug("Terminating pool %s", str(pool)) pool.terminate() finished = True break at_least_1_runs = False if not finished: self.results_lock.acquire() finished = sorted(list(self.results.keys())) self.results_lock.release() if all_keys != finished: logging.debug("Still waiting for priorities %s to finish", str(list(set(all_keys) - set(finished)))) at_least_1_runs = True for pool in pool_dict.values(): if pool._state != multiprocessing.pool.TERMINATE: pool.join() if not self.errors.empty(): raise RuntimeError("%i error(s) occured during reading of artifact list." % self.errors.qsize()) return self._get_artifact_list()
def __init__(self, directory, window_size, window_stride, window_type, normalize, max_len=101, target_size=(256, 256), color_mode='grayscale', classes=None, class_mode='categorical', batch_size=32, shuffle=True, seed=None, data_format=None, save_to_dir=None, save_prefix='', save_format='png', follow_links=False, interpolation='nearest', augment=False, allow_speedandpitch=False, allow_pitch=False, allow_speed=False, allow_dyn=False, allow_noise=False, allow_timeshift=False): if data_format is None: data_format = K.image_data_format() self.window_size = window_size self.window_stride = window_stride self.window_type = window_type self.normalize = normalize self.max_len = max_len self.directory = directory self.allow_speedandpitch = allow_speedandpitch self.allow_pitch = allow_pitch self.allow_speed = allow_speed self.allow_dyn = allow_dyn self.allow_noise = allow_noise self.allow_timeshift = allow_timeshift self.augment = augment # self.image_data_generator = image_data_generator self.target_size = tuple(target_size) if color_mode not in {'rgb', 'grayscale'}: raise ValueError('Invalid color mode:', color_mode, '; expected "rgb" or "grayscale".') self.color_mode = color_mode self.data_format = data_format if self.color_mode == 'rgb': if self.data_format == 'channels_last': self.image_shape = self.target_size + (3, ) else: self.image_shape = (3, ) + self.target_size else: if self.data_format == 'channels_last': self.image_shape = self.target_size + (1, ) else: self.image_shape = (1, ) + self.target_size self.classes = classes if class_mode not in { 'categorical', 'binary', 'sparse', 'input', None }: raise ValueError( 'Invalid class_mode:', class_mode, '; expected one of "categorical", ' '"binary", "sparse", "input"' ' or None.') self.class_mode = class_mode self.save_to_dir = save_to_dir self.save_prefix = save_prefix self.save_format = save_format self.interpolation = interpolation white_list_formats = {'png', 'jpg', 'jpeg', 'bmp', 'ppm', 'wav'} # first, count the number of samples and classes self.samples = 0 if not classes: classes = [] for subdir in sorted(os.listdir(directory)): if os.path.isdir(os.path.join(directory, subdir)): classes.append(subdir) self.num_classes = len(classes) self.class_indices = dict(zip(classes, range(len(classes)))) pool = multiprocessing.pool.ThreadPool() function_partial = partial(_count_valid_files_in_directory, white_list_formats=white_list_formats, follow_links=follow_links) self.samples = sum( pool.map(function_partial, (os.path.join(directory, subdir) for subdir in classes))) print('Found %d images belonging to %d classes.' 
% (self.samples, self.num_classes)) # second, build an index of the images in the different class subfolders results = [] self.filenames = [] self.classes = np.zeros((self.samples, ), dtype='int32') i = 0 for dirpath in (os.path.join(directory, subdir) for subdir in classes): results.append( pool.apply_async(_list_valid_filenames_in_directory, (dirpath, white_list_formats, self.class_indices, follow_links))) for res in results: classes, filenames = res.get() self.classes[i:i + len(classes)] = classes self.filenames += filenames if i == 0: img = spect_loader(os.path.join(self.directory, filenames[0]), self.window_size, self.window_stride, self.window_type, self.normalize, self.max_len, self.augment, self.allow_speedandpitch, self.allow_pitch, self.allow_speed, self.allow_dyn, self.allow_noise, self.allow_timeshift) img = np.swapaxes(img, 0, 2) self.target_size = tuple((img.shape[0], img.shape[1])) print(self.target_size) if self.color_mode == 'rgb': if self.data_format == 'channels_last': self.image_shape = self.target_size + (3, ) else: self.image_shape = (3, ) + self.target_size else: if self.data_format == 'channels_last': self.image_shape = self.target_size + (1, ) else: self.image_shape = (1, ) + self.target_size i += len(classes) pool.close() pool.join() super(SpeechDirectoryIterator, self).__init__(self.samples, batch_size, shuffle, seed)
def glitch_code(test_code, architecture, bytes_to_trash_in, flip_type, pool_instances, enable_cache=True, force_invalid=False): """ :param code: :param bytes_to_trash: the index into the bytes (these must be sequential!) :param architecture: :return: """ global code_initial, bytes_to_trash, flip_operation, result_cache global force_invalid_ins force_invalid_ins = force_invalid if not enable_cache: result_cache = None bytes_to_trash = bytes_to_trash_in if not init_architecture(architecture): return {} # assemble the program code_initial, count = KEYSTONE.asm(test_code) code = ''.join(map(chr, code_initial)) asm = list(CAPSTONE.disasm(code, len(code_initial))) if len(asm) == 0: logger.warning('>>> \tdisasm failure'.format(code.encode('hex'))) for ins in asm: logger.info('>>> {}\t{} {}'.format(code.encode('hex'), ins.mnemonic, ins.op_str)) flip_operation = flip_type flip_type_str = flip_type.name # Init our pool if pool_instances is None: pool = MyPool() else: pool = MyPool(pool_instances) initial_bytes = [code_initial[x] for x in bytes_to_trash] results = { 'input': { 'initial_bytes': initial_bytes, 'initial_code': code_initial }, flip_type_str: {} } bit_list = range(len(bytes_to_trash) * 8) bit_count = range(len(bytes_to_trash) * 8 + 1) # e.g., 0 to 16 bits to flip to include edge cases for number_of_bits_to_flip in bit_count: # Init our results results[flip_type_str][number_of_bits_to_flip] = {} # Let's fire off all of our threads in the pool logger.info("* Trying %d bit flips (%s)..." % (number_of_bits_to_flip, flip_type_str)) rtn_vals = pool.imap( test_program, itertools.combinations(bit_list, number_of_bits_to_flip)) # Aggregate all of our results for rtn in rtn_vals: if rtn not in results[flip_type_str][number_of_bits_to_flip]: results[flip_type_str][number_of_bits_to_flip][rtn] = 1 else: results[flip_type_str][number_of_bits_to_flip][rtn] += 1 # Let's print them at each iteration to have some idea of progress # pprint.pprint(results[flip_type_str][number_of_bits_to_flip]) logger.info( pprint.pformat(results[flip_type_str][number_of_bits_to_flip])) # logger.info(pprint.pformat(results)) result_cache = {} # Close our pool up pool.close() pool.terminate() pool.join() # Clear cache, it's unlikely to be useful in the next run if result_cache is not None: result_cache = {} return results
def buildDecisionTree(df, root, file, config, dataset_features, parent_level = 0, leaf_id = 0, parents = 'root', validation_df = None): models = [] feature_names = df.columns[0:-1] enableParallelism = config['enableParallelism'] algorithm = config['algorithm'] json_file = file.split(".")[0]+".json" if root == 1: if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True: raw_df = df.copy() #-------------------------------------- df_copy = df.copy() winner_name, num_of_instances, metric, metric_name = findDecision(df, config) #find winner index, this cannot be returned by find decision because columns dropped in previous steps j = 0 for i in dataset_features: if i == winner_name: winner_index = j j = j + 1 numericColumn = False if dataset_features[winner_name] != 'object': numericColumn = True #restoration columns = df.shape[1] for i in range(0, columns-1): column_name = df.columns[i]; column_type = df[column_name].dtypes if column_type != 'object' and column_name != winner_name: df[column_name] = df_copy[column_name] classes = df[winner_name].value_counts().keys().tolist() #----------------------------------------------------- num_cores = config["num_cores"] input_params = [] #serial approach for i in range(0,len(classes)): current_class = classes[i] subdataset = df[df[winner_name] == current_class] subdataset = subdataset.drop(columns=[winner_name]) branch_index = i * 1 #create branches serially if enableParallelism != True: if i == 0: #descriptor = "# Feature: "+winner_name+", Instances: "+str(num_of_instances)+", "+metric_name+": "+str(round(metric, 4)) descriptor = { "feature": winner_name, "instances": num_of_instances, #"metric_name": metric_name, "metric_value": round(metric, 4), "depth": parent_level + 1 } descriptor = "# "+json.dumps(descriptor) functions.storeRule(file, (functions.formatRule(root), "", descriptor)) createBranch(config, current_class, subdataset, numericColumn, branch_index , winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric) else: input_params.append((config, current_class, subdataset, numericColumn, branch_index , winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric)) #--------------------------- #add else condition in the decision tree if df.Decision.dtypes == 'object': #classification pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index() pivot = pivot.rename(columns = {"Decision": "Instances","index": "Decision"}) pivot = pivot.sort_values(by = ["Instances"], ascending = False).reset_index() else_decision = "return '%s'" % (pivot.iloc[0].Decision) if enableParallelism != True: functions.storeRule(file,(functions.formatRule(root), "else:")) functions.storeRule(file,(functions.formatRule(root+1), else_decision)) else: #parallelism leaf_id = str(uuid.uuid1()) custom_rule_file = "outputs/rules/"+str(leaf_id)+".txt" check_rule = "else: "+else_decision sample_rule = {} sample_rule["current_level"] = root sample_rule["leaf_id"] = leaf_id sample_rule["parents"] = parents sample_rule["rule"] = check_rule sample_rule["feature_idx"] = -1 sample_rule["feature_name"] = "" sample_rule["instances"] = df.shape[0] sample_rule["metric"] = 0 sample_rule["return_statement"] = 0 #json to string sample_rule = json.dumps(sample_rule) functions.createFile(custom_rule_file, "") functions.storeRule(custom_rule_file, sample_rule) else: #regression else_decision = "return %s" % (subdataset.Decision.mean()) if enableParallelism != True: 
functions.storeRule(file,(functions.formatRule(root), "else:")) functions.storeRule(file,(functions.formatRule(root+1), else_decision)) else: leaf_id = str(uuid.uuid1()) custom_rule_file = "outputs/rules/"+str(leaf_id)+".txt" check_rule = "else: "+else_decision sample_rule = " {\n" sample_rule += " \"current_level\": "+str(root)+",\n" sample_rule += " \"leaf_id\": \""+str(leaf_id)+"\",\n" sample_rule += " \"parents\": \""+parents+"\",\n" sample_rule += " \"rule\": \""+check_rule+"\"\n" sample_rule += " }" functions.createFile(custom_rule_file, "") functions.storeRule(custom_rule_file, sample_rule) #--------------------------- #create branches in parallel if enableParallelism == True: """ #this usage causes trouble for recursive functions with Pool(number_of_cpus) as pool: pool.starmap(createBranch, input_params) """ pool = MyPool(num_cores) results = pool.starmap(createBranch, input_params) pool.close() pool.join() #--------------------------------------------- if root == 1: if enableParallelism == True: #custom rules are stored in .txt files. merge them all in a json file functions.createFile(json_file, "[\n") custom_rules = [] file_index = 0 for file in os.listdir(os.getcwd()+"/outputs/rules"): if file.endswith(".txt"): custom_rules.append(os.getcwd()+"/outputs/rules/"+file) #print(file) #this file stores a custom rule f = open(os.getcwd()+"/outputs/rules/"+file, "r") custom_rule = f.read() if file_index > 0: custom_rule = ", "+custom_rule functions.storeRule(json_file, custom_rule) f.close() file_index = file_index + 1 functions.storeRule(json_file, "]") #----------------------------------- #custom rules are already merged in a json file. clear messy custom rules #TO-DO: if random forest trees are handled in parallel, this would be a problem. You cannot know the related tree of a rule. You should store a global tree id in a rule. for file in custom_rules: os.remove(file) #----------------------------------- reconstructRules(json_file, feature_names) #feature importance should be calculated by demand? feature_importance(json_file, dataset_features) #----------------------------------- #is regular decision tree if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True: #this is reguler decision tree. find accuracy here. moduleName = "outputs/rules/rules" fp, pathname, description = imp.find_module(moduleName) myrules = imp.load_module(moduleName, fp, pathname, description) #rules0 models.append(myrules) return models
def _run_next_graph_nodes(graph, node, globals_, locals_, pool): operator = graph.node[node].get('OPERATOR', None) nodes_return_value = [] return_value = None # False? Terminate Flow. if isinstance(locals_['_'], bool) and locals_['_'] is False: return False if operator: # --> (a) # --> / | \ # (b) (c) (d) # \ | / # (e) next_nodes = sorted(graph.successors(node)) # N-1 for next_node in next_nodes[1:]: # Synchronous if operator == '|': nodes_return_value.append( pool.apply(_run, args=(graph, next_node, globals_, locals_, {}, None, False))) # Asynchronous if operator == '->': nodes_return_value.append( pool.apply_async(_run, args=(graph, next_node, globals_, locals_, {}, None, False))) # 1 nodes_return_value.insert( 0, _run(graph, next_nodes[0], globals_, locals_, {}, None, False)) pool.close() pool.join() pool.terminate() return_value = __resolve_and_merge_results(nodes_return_value) else: # (a) # / | \ # (b) (c) (d) # \ | / # --> (e) return_value = locals_['_'] return return_value
def __init__(self, directory, image_data_generator, target_size=(256, 256), color_mode='rgb', classes=None, class_mode='categorical', batch_size=32, shuffle=True, seed=None, data_format='channels_last', save_to_dir=None, save_prefix='', save_format='png', follow_links=False, subset=None, interpolation='nearest', dtype='float32'): super(DirectoryIterator, self).common_init(image_data_generator, target_size, color_mode, data_format, save_to_dir, save_prefix, save_format, subset, interpolation) self.directory = directory self.classes = classes if class_mode not in self.allowed_class_modes: raise ValueError( 'Invalid class_mode: {}; expected one of: {}'.format( class_mode, self.allowed_class_modes)) self.class_mode = class_mode self.dtype = dtype # First, count the number of samples and classes. self.samples = 0 if not classes: classes = [] for subdir in sorted(os.listdir(directory)): if os.path.isdir(os.path.join(directory, subdir)): classes.append(subdir) self.num_classes = len(classes) self.class_indices = dict(zip(classes, range(len(classes)))) pool = multiprocessing.pool.ThreadPool() # Second, build an index of the images # in the different class subfolders. results = [] self.filenames = [] i = 0 for dirpath in (os.path.join(directory, subdir) for subdir in classes): results.append( pool.apply_async(_list_valid_filenames_in_directory, (dirpath, self.white_list_formats, self.split, self.class_indices, follow_links))) classes_list = [] for res in results: classes, filenames = res.get() classes_list.append(classes) self.filenames += filenames self.samples = len(self.filenames) self.classes = np.zeros((self.samples, ), dtype='int32') for classes in classes_list: self.classes[i:i + len(classes)] = classes i += len(classes) print('Found %d images belonging to %d classes.' % (self.samples, self.num_classes)) pool.close() pool.join() super(DirectoryIterator, self).__init__(self.samples, batch_size, shuffle, seed)
def __init__(self, directory, image_data_generator, augmentations=None, target_size=(256, 256, 256), num_channels=1, num_patches=1, classes=None, class_mode='categorical', batch_size=32, shuffle=True, seed=None, save_to_dir=None, save_prefix='', save_format='png', axial_slice=None, follow_links=False, split=None): self.directory = directory self.image_data_generator = image_data_generator self.augmentations = augmentations self.target_size = tuple(target_size) self.num_channels = num_channels self.num_patches = num_patches self.image_shape = self.target_size + (num_channels, ) self.classes = classes if class_mode not in { 'categorical', 'binary', 'sparse', 'input', None }: raise ValueError( 'Invalid class_mode:', class_mode, '; expected one of "categorical", ' '"binary", "sparse", "input"' ' or None.') self.class_mode = class_mode self.save_to_dir = save_to_dir self.save_prefix = save_prefix self.save_format = save_format self.axial_slice = axial_slice white_list_formats = {'nii', 'nii.gz'} # Counter number of samples and classes self.samples = 0 if not classes: classes = [] for subdir in sorted(os.listdir(directory)): if os.path.isdir(os.path.join(directory, subdir)): classes.append(subdir) self.num_classes = len(classes) self.class_indices = dict(zip(classes, range(len(classes)))) pool = multiprocessing.pool.ThreadPool() function_partial = partial(_count_valid_files_in_directory, white_list_formats=white_list_formats, follow_links=follow_links, split=None) self.samples = sum( pool.map(function_partial, (os.path.join(directory, subdir) for subdir in classes))) print('Found %d images belonging to %d classes.' % (self.samples, self.num_classes)) # Build an index of the images in the different class subfolders results = [] self.filenames = [] self.classes = np.zeros((self.samples, ), dtype='int32') i = 0 for dirpath in (os.path.join(directory, subdir) for subdir in classes): results.append( pool.apply_async(_list_valid_filenames_in_directory, (dirpath, white_list_formats, split, self.class_indices, follow_links))) for res in results: classes, filenames = res.get() self.classes[i:i + len(classes)] = classes self.filenames += filenames i += len(classes) pool.close() pool.join() super(NIfTIDirectoryIterator, self).__init__(self.samples, batch_size, shuffle, seed)
def __init__(self, pathlists, classes, image_data_generator, target_size=(256, 256), color_mode='rgb', class_mode='categorical', batch_size=32, shuffle=True, seed=None, data_format='channels_last', save_to_dir=None, save_prefix='', save_format='png', subset=None, interpolation='nearest', dtype='float32'): if data_format is None: data_format = backend.image_data_format() self.image_data_generator = image_data_generator self.target_size = tuple(target_size) if color_mode not in {'rgb', 'rgba', 'grayscale'}: raise ValueError('Invalid color mode:', color_mode, '; expected "rgb", "rgba", or "grayscale".') self.color_mode = color_mode self.data_format = data_format if self.color_mode == 'rgba': if self.data_format == 'channels_last': self.image_shape = self.target_size + (4, ) else: self.image_shape = (4, ) + self.target_size elif self.color_mode == 'rgb': if self.data_format == 'channels_last': self.image_shape = self.target_size + (3, ) else: self.image_shape = (3, ) + self.target_size else: if self.data_format == 'channels_last': self.image_shape = self.target_size + (1, ) else: self.image_shape = (1, ) + self.target_size self.save_to_dir = save_to_dir self.save_prefix = save_prefix self.save_format = save_format self.interpolation = interpolation if subset is not None: validation_split = self.image_data_generator._validation_split if subset == 'validation': split = (0, validation_split) elif subset == 'training': split = (validation_split, 1) else: raise ValueError('Invalid subset name: ', subset, '; expected "training" or "validation"') else: split = None self.subset = subset self.pathlists = pathlists self.classes = classes if class_mode not in { 'categorical', 'binary', 'sparse', 'input', None }: raise ValueError( 'Invalid class_mode:', class_mode, '; expected one of "categorical", ' '"binary", "sparse", "input"' ' or None.') self.class_mode = class_mode self.dtype = dtype white_list_formats = { 'png', 'jpg', 'jpeg', 'bmp', 'ppm', 'tif', 'tiff' } # First, count the number of samples and classes. self.samples = 0 self.num_classes = len(classes) self.class_indices = dict(zip(classes, range(len(classes)))) pool = multiprocessing.pool.ThreadPool() self.samples = sum(pool.map(len, pathlists)) print('Found %d images belonging to %d classes.' % (self.samples, self.num_classes), file=sys.stderr) # Second, build an index of the images # in the different class subfolders. results = [] self.filenames = [] self.classes = np.zeros((self.samples, ), dtype='int32') i = 0 for pathlist, class_index in zip(pathlists, self.class_indices.items()): self.filenames += pathlist self.classes[i:i + len(pathlist)] = class_index[1] i += len(pathlist) pool.close() pool.join() super(PathListsIterator, self).__init__(self.samples, batch_size, shuffle, seed)
def _spider(url, visited, root, depth, max_depth, raise_on_error): """Fetches URL and any pages it links to up to max_depth. depth should initially be zero, and max_depth is the max depth of links to follow from the root. Prints out a warning only if the root can't be fetched; it ignores errors with pages that the root links to. Returns a tuple of: - pages: dict of pages visited (URL) mapped to their full text. - links: set of links encountered while visiting the pages. """ pages = {} # dict from page URL -> text content. links = set() # set of all links seen on visited pages. # root may end with index.html -- chop that off. if root.endswith('/index.html'): root = re.sub('/index.html$', '', root) try: response_url, page = _read_from_url(url, 'text/html') if not response_url or not page: return pages, links pages[response_url] = page # Parse out the links in the page link_parser = LinkParser() subcalls = [] link_parser.feed(page) while link_parser.links: raw_link = link_parser.links.pop() abs_link = urljoin(response_url, raw_link.strip()) links.add(abs_link) # Skip stuff that looks like an archive if any(raw_link.endswith(suf) for suf in ALLOWED_ARCHIVE_TYPES): continue # Skip things outside the root directory if not abs_link.startswith(root): continue # Skip already-visited links if abs_link in visited: continue # If we're not at max depth, follow links. if depth < max_depth: subcalls.append((abs_link, visited, root, depth + 1, max_depth, raise_on_error)) visited.add(abs_link) if subcalls: pool = NonDaemonPool(processes=len(subcalls)) try: results = pool.map(_spider_wrapper, subcalls) for sub_pages, sub_links in results: pages.update(sub_pages) links.update(sub_links) finally: pool.terminate() pool.join() except URLError as e: tty.debug(e) if hasattr(e, 'reason') and isinstance(e.reason, ssl.SSLError): tty.warn("Spack was unable to fetch url list due to a certificate " "verification problem. You can try running spack -k, " "which will not check SSL certificates. Use this at your " "own risk.") if raise_on_error: raise NoNetworkConnectionError(str(e), url) except HTMLParseError as e: # This error indicates that Python's HTML parser sucks. msg = "Got an error parsing HTML." # Pre-2.7.3 Pythons in particular have rather prickly HTML parsing. if sys.version_info[:3] < (2, 7, 3): msg += " Use Python 2.7.3 or newer for better HTML parsing." tty.warn(msg, url, "HTMLParseError: " + str(e)) except Exception as e: # Other types of errors are completely ignored, except in debug mode. tty.debug("Error in _spider: %s:%s" % (type(e), e), traceback.format_exc()) return pages, links
def run_instances(database, instances, filter_string, ubxlib_dir, working_dir, clean, summary_report_file, test_report_file, debug_file): '''Run the given instances''' return_value = 0 processes = [] platform_locks = [] misc_locks = {} alive_count = 0 report_thread = None report_queue = None reporter = None summary_report_file_path = None test_report_file_path = None debug_file_path = None summary_report_handle = None manager = Manager() # Create a lock to cover things that cross # platforms or that any process of u_run.main() # may need to perform outside of its working # directory misc_locks["system_lock"] = manager.RLock() # Create a lock which can be used on Nordic # platforms (nRF5 and Zephyer): performing a # JLink download to a board while JLink RTT logging # is active on any other board will often stop # the RTT logging even though the sessions are # aimed at debuggers with entirely different # serial numbers. misc_locks["jlink_lock"] = manager.RLock() # Create a "lock" that can be used on STM32F4 # platforms to ensure that all downloads are # completed before logging commences. We # can do this, rather than locking a tool for the # whole time as we have to do with Nordic, because # each STM32F4 board only runs a single instance misc_locks["stm32f4_downloads_list"] = manager.list() # It is possible for some platforms to be a bit # pants at running in multiple instances # hence here we create a lock per platform and pass it # into the instance for it to be able to manage # multiplicity if required create_platform_locks(database, instances, manager, platform_locks) # Launch a thread that prints stuff out # nicely from multiple sources print_queue = manager.Queue() print_thread = u_utils.PrintThread(print_queue) print_thread.start() # Set up a printer for this thread to print to the queue printer = u_utils.PrintToQueue(print_queue, None, True) if summary_report_file: # Launch a thread that manages reporting # from multiple sources summary_report_file_path = working_dir + os.sep + summary_report_file summary_report_handle = open(summary_report_file_path, "w") if summary_report_handle: printer.string("{}writing overall summary report to \"{}\".". \ format(PROMPT, summary_report_file_path)) else: printer.string("{}unable to open file \"{}\" for overall summary report.". \ format(PROMPT, summary_report_file_path)) report_queue = manager.Queue() report_thread = u_report.ReportThread(report_queue, summary_report_handle) report_thread.start() reporter = u_report.ReportToQueue(report_queue, None, None, printer) reporter.open() # From this post: # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python # ...create a pool of worker processes to run our # instances, then they will handle sigint correctly # and tidy up after themselves. 
# SIGINT is ignored while the pool is created original_sigint_handler = signal(SIGINT, SIG_IGN) pool = NoDaemonPool(len(instances)) signal(SIGINT, original_sigint_handler) # Create locks for connections u_connection.init_locks(manager) try: # Set up all the instances for instance in instances: # Provide a working directory that is unique # for each instance and make sure it exists if working_dir: this_working_dir = working_dir + os.sep + \ INSTANCE_DIR_PREFIX + \ u_utils.get_instance_text(instance).replace(".", "_") else: this_working_dir = os.getcwd() + os.sep + \ INSTANCE_DIR_PREFIX + \ u_utils.get_instance_text(instance).replace(".", "_") if not os.path.isdir(this_working_dir): os.makedirs(this_working_dir) # Only clean the working directory if requested if clean: u_utils.deltree(this_working_dir, printer, PROMPT) os.makedirs(this_working_dir) # Create the file paths for this instance if summary_report_file: summary_report_file_path = this_working_dir + os.sep + summary_report_file if test_report_file: test_report_file_path = this_working_dir + os.sep + test_report_file if debug_file: debug_file_path = this_working_dir + os.sep + debug_file # Start u_run.main in each worker thread process = {} process["platform"] = u_data.get_platform_for_instance( database, instance) process["instance"] = instance process["platform_lock"] = None process["connection_lock"] = u_connection.get_lock(instance) for platform_lock in platform_locks: if process["platform"] == platform_lock["platform"]: process["platform_lock"] = platform_lock["lock"] break process["handle"] = pool.apply_async( u_run.main, (database, instance, filter_string, True, ubxlib_dir, this_working_dir, process["connection_lock"], process["platform_lock"], misc_locks, print_queue, report_queue, summary_report_file_path, test_report_file_path, debug_file_path)) alive_count += 1 processes.append(process.copy()) # Wait for all the launched processes to complete printer.string("{}all instances now launched.".format(PROMPT)) loop_count = 0 while alive_count > 0: for process in processes: instance_text = u_utils.get_instance_text(process["instance"]) if not "dealt_with" in process and process["handle"].ready(): try: # If the return value has gone negative, i.e. # an infrastructure failure, leave it there, # else add the number of test failures to it if (return_value >= 0 and process["handle"].get() > 0) or \ (return_value <= 0 and process["handle"].get() < 0): return_value += process["handle"].get() except KeyboardInterrupt as ex: raise KeyboardInterrupt from ex except Exception as ex: # If an instance threw an exception then flag an # infrastructure error return_value = -1 printer.string("{}instance {} threw exception \"{}:" \ " {}\" but I can't tell you where" \ " I'm afraid.". \ format(PROMPT, instance_text, type(ex).__name__, str(ex))) if reporter: reporter.event(u_report.EVENT_TYPE_INFRASTRUCTURE, u_report.EVENT_FAILED, "instance {} threw exception \"{}: {}\"". \ format(instance_text, type(ex).__name__, str(ex))) alive_count -= 1 process["dealt_with"] = True if not process["handle"].ready() and \ (loop_count == STILL_RUNNING_REPORT_SECONDS): printer.string("{}instance {} still running.". 
\ format(PROMPT, instance_text)) loop_count += 1 if loop_count > STILL_RUNNING_REPORT_SECONDS: loop_count = 0 sleep(1) except KeyboardInterrupt: # Pools can tidy themselves up on SIGINT printer.string( "{}caught CTRL-C, terminating instances...".format(PROMPT)) if reporter: reporter.event(u_report.EVENT_TYPE_INFRASTRUCTURE, u_report.EVENT_FAILED, "CTRL-C received, terminating") pool.terminate() return_value = -1 # Tidy up pool.close() pool.join() if reporter: reporter.event_extra_information("return value overall {} (0 = success, negative =" \ " probable infrastructure failure, positive =" \ " failure(s) (may still be due to infrastructure))". \ format(return_value)) reporter.close() # Wait for the print and report queues to empty # and stop the print process printer.string("{}all runs complete, return value {}.".format( PROMPT, return_value)) sleep(1) print_thread.stop_thread() print_thread.join() # Stop the reporting process if report_thread: report_thread.stop_thread() report_thread.join() if summary_report_handle: summary_report_handle.close() return return_value
def stest(self, repeat, threshold, mp): print("\n\tIntersection random subsampling test:\n Repeat " + str(repeat) + " times\n") self.test_time = repeat self.test_d = {} plist = OrderedDict() for ty in self.groupedreference.keys(): self.test_d[ty] = {} plist[ty] = OrderedDict() for r in self.groupedreference[ty]: if r.name in self.nalist: continue print("\t" + r.name) self.test_d[ty][r.name] = {} plist[ty][r.name] = OrderedDict() print("\t.", end="") sys.stdout.flush() for q in self.groupedquery[ty]: if r.name == q.name: continue else: print(".", end="") sys.stdout.flush() if q.name in self.nalist: continue # True intersection obs = self.counts[ty][r.name][q.name] qn = q.name if obs[2] == 0: aveinter, chisq, p = "NA", "NA", "1" else: com = q.combine(r, change_name=False, output=True) # Randomization d = [] inp = [ com, self.rlen[ty][r.name], self.mode_count, threshold ] mp_input = [inp for i in range(repeat)] pool = multiprocessing.Pool(processes=mp) mp_output = pool.map(mp_count_intersets, mp_input) pool.close() pool.join() # for i in range(repeat): # random_r,random_q = com.random_split(size=self.rlen[ty][r.name]) # d.append(random_r.intersect_count(random_q, mode_count=self.mode_count, threshold=threshold)) # d.append(count_intersect(random_r, random_q, mode_count=self.mode_count, threshold=threshold)) da = numpy.array(mp_output) exp_m = numpy.mean(da, axis=0) # print(exp_m) # print(obs) chisq, p, dof, expected = stats.chi2_contingency( [exp_m, obs]) aveinter = exp_m[2] plist[ty][r.name][qn] = p self.test_d[ty][r.name][qn] = [aveinter, chisq, p] print() multiple_correction(plist) # c_p = 0 for r in self.test_d[ty].keys(): if r in self.nalist: continue for q in self.test_d[ty][r].keys(): self.test_d[ty][r][q][2] = plist[ty][r][q]
def count_intersect(self, threshold, frequency=True): self.counts = OrderedDict() self.rlen, self.qlen = {}, {} self.nalist = [] if frequency: self.frequency = OrderedDict() # if self.mode_count == "bp": # print2(self.parameter, "\n{0}\t{1}\t{2}\t{3}\t{4}".format("Reference","Length(bp)", "Query", "Length(bp)", "Length of Intersection(bp)")) # elif self.mode_count == "count": # print2(self.parameter, "\n{0}\t{1}\t{2}\t{3}\t{4}".format("Reference","sequence_number", "Query", "sequence_number", "Number of Intersection")) for ty in self.groupedreference.keys(): self.counts[ty] = OrderedDict() self.rlen[ty], self.qlen[ty] = OrderedDict(), OrderedDict() if frequency: self.frequency[ty] = OrderedDict() for r in self.groupedreference[ty]: if r.total_coverage() == 0 and len(r) > 0: self.nalist.append(r.name) continue else: self.counts[ty][r.name] = OrderedDict() if self.mode_count == "bp": rlen = r.total_coverage() elif self.mode_count == "count": rlen = len(r) self.rlen[ty][r.name] = rlen mp_input = [] for q in self.groupedquery[ty]: if r.name == q.name: continue else: mp_input.append([ q, self.nalist, self.mode_count, self.qlen, threshold, self.counts, frequency, self.frequency, ty, r ]) # q, nalist, mode_count, qlen_dict, threshold, counts, frequency, self_frequency, ty, r pool = multiprocessing.Pool( processes=multiprocessing.cpu_count() - 1) mp_output = pool.map(mp_count_intersect, mp_input) pool.close() pool.join() # qname, nalist, qlen_dict[ty][q.name], counts[ty][r.name][q.name], self_frequency[ty][q.name].append(c[2]) for output in mp_output: if output[1]: self.nalist.append(output[1]) else: self.qlen[ty][output[0]] = output[2] self.counts[ty][r.name][output[0]] = output[3] # print(r.name) # print(output[0]) # print(output[3]) try: self.frequency[ty][output[0]][ r.name] = output[3][2] except: self.frequency[ty][output[0]] = {} self.frequency[ty][output[0]][ r.name] = output[3][2]
def learn(self, selected_data, verbose=None, n_min=1, limit=None, score_max=fpconst.PosInf, score_delta=fpconst.PosInf, cores=False, picloud=False): if verbose: print 'Learning parents of', selected_data.vertex.name, '...', v = selected_data.vertex nd = len(selected_data) parents = selected_data.parents p_weights = selected_data.weights n = len(parents) try: lim = int(limit) except TypeError: lim = n selected_data_empty = selected_data.subset([]) mindata = self.lower_bound_for_data_score(selected_data_empty) min_set = minset(n_min,score_max,score_delta,self.data_score(selected_data_empty)+\ self.graph_score(n,v,[],nd)) if n: w_min = p_weights[parents[0]] w_max = p_weights[parents[-1]] if w_min == w_max: if verbose: print "Using algorithm 2" weight = w_min size = 1 mg = self.graph_score(n, v, [weight], nd) while min_set.accepts(mg + mindata) and (size <= lim): if cores: import multiprocessing import multiprocessing.pool pool = multiprocessing.Pool(cores) sub_obj = list(self.subsets(parents, size)) import itertools results = pool.map(looper, [(selected_data, y, self) for y in sub_obj]) pool.close() pool.join() for result, sub in itertools.izip(results, sub_obj): min_set.add(mg + result, sub) else: for sub in self.subsets(parents, size): selected_data_sub = selected_data.subset(sub) min_set.add( mg + self.data_score(selected_data_sub), sub) size += 1 mg = self.graph_score(n, v, [weight] * size, nd) else: if verbose: print "Using algorithm 1" if cores: import multiprocessing import multiprocessing.pool pool = multiprocessing.Pool(cores) size = 1 results = [1] while (True in results) and (size <= lim): subs = list(self.subsets(parents, size)) scores = pool.map(looper, [(selected_data, y, self) for y in subs]) mgs = [] for sub in subs: weight = 0 for parent in sub: weight = weight + p_weights[parent] mgs.append(self.graph_score(n, v, [weight], nd)) import itertools for score, sub, mg in itertools.izip( scores, subs, mgs): min_set.add(mg + score, sub) results = pool.map(unwrap_min_set_accepts, [(min_set, mg + mindata) for mg in mgs]) del mgs, subs, scores size += 1 pool.close() pool.join() else: subsets = [] for parent in parents: heappush(subsets, (self.graph_score( n, v, [p_weights[parent]], nd), [p_weights[parent]], [parent])) while subsets: mg, weights, sub = heappop(subsets) if not min_set.accepts(mg + mindata): break selected_data_sub = selected_data.subset(sub) min_set.add(mg + self.data_score(selected_data_sub), sub) if len(sub) < lim: last_parent = parents.index(sub[-1]) for parent in parents[last_parent + 1:]: sub_succ = sub + [parent] weights_succ = weights + [p_weights[parent]] mg_succ = self.graph_score( n, v, weights_succ, nd) heappush(subsets, (mg_succ, weights_succ, sub_succ)) if verbose: print 'done', min_set return min_set.optimal, min_set.tolist()
def learn_1(self, selected_data, verbose=None, n_min=1, limit=None, score_max=fpconst.PosInf, score_delta=fpconst.PosInf, cores=False, picloud=False): if verbose: print 'Learning parents of', selected_data.vertex.name, '...', # if not self.sloops: # selected_data.rm_sloops() v = selected_data.vertex nd = len(selected_data) parents = selected_data.parents p_weights = selected_data.weights n = len(parents) try: lim = int(limit) except TypeError: #limit was None lim = n selected_data_empty = selected_data.subset([]) mindata = self.lower_bound_for_data_score(selected_data_empty) min_set = minset(n_min,score_max,score_delta,self.data_score(selected_data_empty)+\ self.graph_score(n,v,[],nd)) #empty parents set if n: # are there any potential parents? w_min = p_weights[parents[0]] w_max = p_weights[parents[-1]] if w_min == w_max: # we can use algorithm 2 if verbose: print "Using algorithm 2" weight = w_min size = 1 mg = self.graph_score(n, v, [weight], nd) while min_set.accepts(mg + mindata) and ( size <= lim): #we can possibly add (sub-)optimal scores # Parallelized version if cores: import multiprocessing import multiprocessing.pool pool = multiprocessing.Pool(cores) sub_obj = list(self.subsets(parents, size)) import itertools results = pool.map(looper, [(selected_data, y, self) for y in sub_obj]) pool.close() pool.join() for result, sub in itertools.izip(results, sub_obj): min_set.add(mg + result, sub) else: for sub in self.subsets(parents, size): #print "sub.size ", len(sub) selected_data_sub = selected_data.subset(sub) min_set.add( mg + self.data_score(selected_data_sub), sub) size += 1 mg = self.graph_score(n, v, [weight] * size, nd) else: # we have to use algorithm 1 if verbose: print "Using algorithm 1" # Parallelized version if cores: import multiprocessing import multiprocessing.pool pool = multiprocessing.Pool(cores) size = 1 results = [1] while (True in results) and (size <= lim): subs = list(self.subsets(parents, size)) scores = pool.map(looper, [(selected_data, y, self) for y in subs]) mgs = [] for sub in subs: weight = 0 for parent in sub: weight = weight + p_weights[parent] mgs.append(self.graph_score(n, v, [weight], nd)) import itertools for score, sub, mg in itertools.izip( scores, subs, mgs): min_set.add(mg + score, sub) results = pool.map(unwrap_min_set_accepts, [(min_set, mg + mindata) for mg in mgs]) del mgs, subs, scores size += 1 pool.close() pool.join() else: subsets = [ ] #successors of considered yet potential parents sets for parent in parents: #one-element parents sets #print "one parent" heappush(subsets, (self.graph_score( n, v, [p_weights[parent]], nd), [p_weights[parent]], [parent])) while subsets: #print subsets mg, weights, sub = heappop(subsets) #print sub if not min_set.accepts( mg + mindata): #we cannot improve the score break selected_data_sub = selected_data.subset(sub) min_set.add(mg + self.data_score(selected_data_sub), sub) #insert sub's successors if len(sub) < lim: last_parent = parents.index(sub[-1]) for parent in parents[last_parent + 1:]: sub_succ = sub + [parent] weights_succ = weights + [p_weights[parent]] mg_succ = self.graph_score( n, v, weights_succ, nd) heappush(subsets, (mg_succ, weights_succ, sub_succ)) if verbose: print 'done', min_set return min_set.optimal, min_set.tolist()
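# Hedged Python 3 sketch of the idea shared by learn and learn_1 above (the
# originals are Python 2): candidate parent subsets of a fixed size are scored in
# parallel with pool.map, and the (score, subset) pairs are reduced serially
# afterwards, much like min_set.add(...). `score_subset` is a placeholder for the
# real data_score/graph_score machinery.
from itertools import combinations
from multiprocessing import Pool


def score_subset(subset):
    return sum(subset)  # stand-in for data_score(selected_data.subset(sub))


if __name__ == "__main__":
    parents, size = [3, 1, 4, 1, 5], 2
    subsets = [list(c) for c in combinations(parents, size)]
    with Pool(processes=2) as pool:
        scores = pool.map(score_subset, subsets)
    best = min(zip(scores, subsets))  # serial reduction over the parallel scores
    print(best)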
def buildDecisionTree(df, root, file, config, dataset_features, parent_level=0, leaf_id=0, parents='root'): models = [] enableParallelism = config['enableParallelism'] algorithm = config['algorithm'] json_file = file.split(".")[0] + ".json" if root == 1: if config['enableRandomForest'] != True and config[ 'enableGBM'] != True and config['enableAdaboost'] != True: raw_df = df.copy() #-------------------------------------- df_copy = df.copy() winner_name = findDecision(df, config) #find winner index, this cannot be returned by find decision because columns dropped in previous steps j = 0 for i in dataset_features: if i == winner_name: winner_index = j j = j + 1 numericColumn = False if dataset_features[winner_name] != 'object': numericColumn = True #restoration columns = df.shape[1] for i in range(0, columns - 1): column_name = df.columns[i] column_type = df[column_name].dtypes if column_type != 'object' and column_name != winner_name: df[column_name] = df_copy[column_name] classes = df[winner_name].value_counts().keys().tolist() #----------------------------------------------------- #TO-DO: you should specify the number of cores in config num_cores = int(multiprocessing.cpu_count() / 2) #allocate half of your total cores input_params = [] #serial approach for i in range(0, len(classes)): current_class = classes[i] subdataset = df[df[winner_name] == current_class] subdataset = subdataset.drop(columns=[winner_name]) branch_index = i * 1 #create branches serially if enableParallelism != True: createBranch(config, current_class, subdataset, numericColumn, branch_index, winner_index, root, parents, file, dataset_features) else: input_params.append((config, current_class, subdataset, numericColumn, branch_index, winner_index, root, parents, file, dataset_features)) #--------------------------- #add else condition in the decision tree if df.Decision.dtypes == 'object': #classification pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index() pivot = pivot.rename(columns={ "Decision": "Instances", "index": "Decision" }) pivot = pivot.sort_values(by=["Instances"], ascending=False).reset_index() else_decision = "return '%s'" % (pivot.iloc[0].Decision) if enableParallelism != True: functions.storeRule(file, (functions.formatRule(root), "else:")) functions.storeRule( file, (functions.formatRule(root + 1), else_decision)) else: #parallelism leaf_id = str(uuid.uuid1()) custom_rule_file = "outputs/rules/" + str(leaf_id) + ".txt" check_rule = "else: " + else_decision sample_rule = " {\n" sample_rule += " \"current_level\": " + str(root) + ",\n" sample_rule += " \"leaf_id\": \"" + str(leaf_id) + "\",\n" sample_rule += " \"parents\": \"" + parents + "\",\n" sample_rule += " \"rule\": \"" + check_rule + "\"\n" sample_rule += " }" functions.createFile(custom_rule_file, "") functions.storeRule(custom_rule_file, sample_rule) else: #regression else_decision = "return %s" % (subdataset.Decision.mean()) if enableParallelism != True: functions.storeRule(file, (functions.formatRule(root), "else:")) functions.storeRule( file, (functions.formatRule(root + 1), else_decision)) else: leaf_id = str(uuid.uuid1()) custom_rule_file = "outputs/rules/" + str(leaf_id) + ".txt" check_rule = "else: " + else_decision sample_rule = " {\n" sample_rule += " \"current_level\": " + str(root) + ",\n" sample_rule += " \"leaf_id\": \"" + str(leaf_id) + "\",\n" sample_rule += " \"parents\": \"" + parents + "\",\n" sample_rule += " \"rule\": \"" + check_rule + "\"\n" sample_rule += " }" functions.createFile(custom_rule_file, "") 
functions.storeRule(custom_rule_file, sample_rule) #--------------------------- #create branches in parallel if enableParallelism == True: """ #this usage causes trouble for recursive functions with Pool(number_of_cpus) as pool: pool.starmap(createBranch, input_params) """ pool = MyPool(num_cores) results = pool.starmap(createBranch, input_params) pool.close() pool.join() #--------------------------------------------- #calculate accuracy metrics if root == 1: if enableParallelism == True: #custom rules are stored in .txt files. merge them all in a json file functions.createFile(json_file, "[\n") custom_rules = [] file_index = 0 for file in os.listdir(os.getcwd() + "/outputs/rules"): if file.endswith(".txt"): custom_rules.append(os.getcwd() + "/outputs/rules/" + file) #print(file) #this file stores a custom rule f = open(os.getcwd() + "/outputs/rules/" + file, "r") custom_rule = f.read() if file_index > 0: custom_rule = ", " + custom_rule functions.storeRule(json_file, custom_rule) f.close() file_index = file_index + 1 functions.storeRule(json_file, "]") #----------------------------------- #custom rules are already merged in a json file. clear messy custom rules #TO-DO: if random forest trees are handled in parallel, this would be a problem. You cannot know the related tree of a rule. You should store a global tree id in a rule. for file in custom_rules: os.remove(file) #----------------------------------- reconstructRules(json_file) #----------------------------------- if config['enableRandomForest'] != True and config[ 'enableGBM'] != True and config['enableAdaboost'] != True: #this is reguler decision tree. find accuracy here. moduleName = "outputs/rules/rules" fp, pathname, description = imp.find_module(moduleName) myrules = imp.load_module(moduleName, fp, pathname, description) #rules0 models.append(myrules) num_of_features = df.shape[1] - 1 instances = df.shape[0] classified = 0 mae = 0 mse = 0 #instead of for loops, pandas functions perform well raw_df['Prediction'] = raw_df.apply(findPrediction, axis=1) if algorithm != 'Regression': idx = raw_df[raw_df['Prediction'] == raw_df['Decision']].index #raw_df['Classified'] = 0 #raw_df.loc[idx, 'Classified'] = 1 #print(raw_df) accuracy = 100 * len(idx) / instances print("Accuracy: ", accuracy, "% on ", instances, " instances") else: raw_df['Absolute_Error'] = abs(raw_df['Prediction'] - raw_df['Decision']) raw_df['Absolute_Error_Squared'] = raw_df[ 'Absolute_Error'] * raw_df['Absolute_Error'] #print(raw_df) mae = raw_df['Absolute_Error'].sum() / instances print("MAE: ", mae) mse = raw_df['Absolute_Error_Squared'].sum() / instances rmse = math.sqrt(mse) print("RMSE: ", rmse) mean = raw_df['Decision'].mean() print("Mean: ", mean) if mean > 0: print("MAE / Mean: ", 100 * mae / mean, "%") print("RMSE / Mean: ", 100 * rmse / mean, "%") return models
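# buildDecisionTree above relies on a custom MyPool because createBranch recurses
# back into buildDecisionTree and opens further pools, and the workers of a
# standard multiprocessing.Pool are daemonic, so they may not spawn children. The
# sketch below mirrors a widely used non-daemonic pool recipe; NestablePool is a
# hypothetical name, not the MyPool defined in that project.
import multiprocessing
import multiprocessing.pool


class NoDaemonProcess(multiprocessing.Process):
    # Pool marks its workers daemonic; ignoring that flag lets workers have children.
    @property
    def daemon(self):
        return False

    @daemon.setter
    def daemon(self, value):
        pass


class NoDaemonContext(type(multiprocessing.get_context())):
    Process = NoDaemonProcess


class NestablePool(multiprocessing.pool.Pool):
    def __init__(self, *args, **kwargs):
        kwargs["context"] = NoDaemonContext()
        super().__init__(*args, **kwargs)


def leaf(x):
    return x * x


def branch(xs):
    # A nested pool inside a worker; a plain Pool worker would refuse to start it.
    with NestablePool(2) as inner:
        return sum(inner.map(leaf, xs))


if __name__ == "__main__":
    with NestablePool(2) as outer:
        print(outer.map(branch, [[1, 2], [3, 4]]))  # [5, 25]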
def pool_slice(func, data_in, args=(), kwds={}, num_workers=None, thread_abort=None, logfile=None, num_batches=1, progress=0): """ Process data in slices using a pool of workers and return the results. The individual worker results are returned in the same order as the original input data, irrespective of the order in which the workers finished (FIFO). Progress percentage is written to optional logfile using a background thread that monitors a queue. Note that 'func' is supposed to periodically check thread_abort.event which is passed as the first argument to 'func', and put its progress percentage into the queue which is passed as the second argument to 'func'. """ from config import getcfg if num_workers is None: num_workers = cpu_count() num_workers = max(min(int(num_workers), len(data_in)), 1) max_workers = getcfg("multiprocessing.max_cpus") if max_workers: num_workers = min(num_workers, max_workers) if num_workers == 1 or not num_batches: # Splitting the workload into batches only makes sense if there are # multiple workers num_batches = 1 chunksize = float(len(data_in)) / (num_workers * num_batches) if chunksize < 1: num_batches = 1 chunksize = float(len(data_in)) / num_workers if num_workers > 1: Pool = NonDaemonicPool manager = mp.Manager() if thread_abort is not None and not isinstance(thread_abort.event, mp.managers.EventProxy): # Replace the event with a managed instance that is compatible # with pool event = thread_abort.event thread_abort.event = manager.Event() if event.is_set(): thread_abort.event.set() else: event = None Queue = manager.Queue else: # Do it all in in the main thread of the current instance Pool = FakePool manager = None Queue = FakeQueue if thread_abort is not None: thread_abort_event = thread_abort.event else: thread_abort_event = None progress_queue = Queue() if logfile: def progress_logger(num_workers, progress=0.0): eof_count = 0 prevperc = -1 while progress < 100 * num_workers: try: inc = progress_queue.get(True, 0.1) if isinstance(inc, Exception): raise inc progress += inc except Empty: continue except IOError: break except EOFError: eof_count += 1 if eof_count == num_workers: break perc = round(progress / num_workers) if perc > prevperc: logfile.write("\r%i%%" % perc) prevperc = perc threading.Thread(target=progress_logger, args=(num_workers * num_batches, progress * num_workers * num_batches), name="ProcessProgressLogger").start() pool = Pool(num_workers) results = [] start = 0 for batch in range(num_batches): for i in range(batch * num_workers, (batch + 1) * num_workers): end = int(math.ceil(chunksize * (i + 1))) results.append( pool.apply_async( WorkerFunc(func, batch == num_batches - 1), (data_in[start:end], thread_abort_event, progress_queue) + args, kwds)) start = end # Get results exception = None data_out = [] for result in results: result = result.get() if isinstance(result, Exception): exception = result continue data_out.append(result) pool.close() pool.join() if manager: # Need to shutdown manager so it doesn't hold files in use if event: # Restore original event if thread_abort.event.is_set(): event.set() thread_abort.event = event manager.shutdown() if exception: raise exception return data_out
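# Minimal sketch of the slice-and-gather core of pool_slice above: data_in is cut
# into contiguous chunks, each chunk is submitted with apply_async, and the
# results are read back in submission order so the output ordering matches the
# input. The progress-queue and thread_abort plumbing of the original is omitted.
import math
from multiprocessing import Pool, cpu_count


def work(chunk):
    return [x * 2 for x in chunk]


if __name__ == "__main__":
    data_in = list(range(10))
    workers = max(min(cpu_count(), len(data_in)), 1)
    chunksize = len(data_in) / workers
    with Pool(workers) as pool:
        results, start = [], 0
        for i in range(workers):
            end = int(math.ceil(chunksize * (i + 1)))
            results.append(pool.apply_async(work, (data_in[start:end],)))
            start = end
        data_out = [r.get() for r in results]  # FIFO: same order as submitted
    print(data_out)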
def build_variable(type: str, ds: DesignSpaceDocument) -> None: output = Path("fonts/ttf") if type == "latin": for instance in ds.instances: instance.name = instance.name.replace("Code", "Code Latin") instance.familyName = instance.familyName.replace( "Code", "Code Latin") if instance.styleMapFamilyName: instance.styleMapFamilyName = str( instance.styleMapFamilyName).replace("Code", "Code Latin") varFont = ufo2ft.compileVariableTTF(ds) styleSpace = statmake.classes.Stylespace.from_file( "sources/Latin_STAT.plist") statmake.lib.apply_stylespace_to_variable_font(styleSpace, varFont, {}) DSIG_modification(varFont) varFont["name"].setName("Mplus Code Latin", 1, 3, 1, 1033) varFont["name"].setName("UFDN;MplusCodeLatin-Regular", 3, 3, 1, 1033) varFont["name"].setName("Mplus Code Latin Regular", 4, 3, 1, 1033) varFont["name"].setName("MplusCodeLatin-Regular", 6, 3, 1, 1033) varFont.save(output / "MplusCodeLatin[wdth,wght].ttf") autohint(output / "MplusCodeLatin[wdth,wght].ttf") prefix = "MplusCodeLatin" if type == "one" or type == "two": print("[MPLUS " + type + "] Importing Kanji") for source in ds.sources: if "{" not in source.name: step_merge_glyphs_from_ufo( Path("sources/M+1p-" + source.filename[7:-4] + ".ufo"), source.font) source.font.features.text = Path( "sources/features.fea").read_text() print("[MPLUS " + type + "] Importing Kanji replacement rules") kanji_ds = DesignSpaceDocument.fromfile( "sources/MPLUS-Kanji.designspace") for rule in kanji_ds.rules: ds.rules.append(rule) print("[MPLUS " + type + "] Building") varFont = ufo2ft.compileVariableTTF(ds) styleSpace = statmake.classes.Stylespace.from_file( "sources/MPLUS_STAT.plist") statmake.lib.apply_stylespace_to_variable_font(styleSpace, varFont, {}) DSIG_modification(varFont) print("[MPLUS " + type + "] Saving") if type == "one": varFont.save(output / "Mplus1[wght].ttf") autohint(output / "Mplus1[wght].ttf") prefix = "Mplus1" elif type == "two": varFont.save(output / "Mplus2[wght].ttf") autohint(output / "Mplus2[wght].ttf") prefix = "Mplus2" if type == "code": for instance in ds.instances: instance.name = instance.name.replace("Mplus", "Mplus 1 ") instance.familyName = instance.familyName.replace( "Mplus", "Mplus 1 ") if instance.styleMapFamilyName: instance.styleMapFamilyName = instance.styleMapFamilyName.replace( "MplusCode", "Mplus 1 Code") print("[MPLUS " + type + "] Importing glyphs") for source in ds.sources: if "{" not in source.name: step_merge_glyphs_from_ufo( Path("sources/Mplus1-" + str(source.name).split(" ")[2] + ".ufo"), source.font, "sources/kana_glyphs.txt") step_merge_glyphs_from_ufo( Path("sources/M+1p-" + str(source.name).split(" ")[2] + ".ufo"), source.font) source.name = source.name.replace("Mplus", "Mplus 1") source.font.features.text = Path("sources/code.fea").read_text() print("[MPLUS " + type + "] Importing Kanji replacement rules") kanji_ds = DesignSpaceDocument.fromfile( "sources/MPLUS-Kanji.designspace") for rule in kanji_ds.rules: ds.rules.append(rule) print("[MPLUS " + type + "] Building") varFont = ufo2ft.compileVariableTTF(ds) styleSpace = statmake.classes.Stylespace.from_file( "sources/MPLUS_STAT.plist") statmake.lib.apply_stylespace_to_variable_font(styleSpace, varFont, {}) DSIG_modification(varFont) varFont["name"].setName("Mplus 1 Code", 1, 3, 1, 1033) varFont["name"].setName("UFDN;Mplus1Code-Regular", 3, 3, 1, 1033) varFont["name"].setName("Mplus 1 Code Regular", 4, 3, 1, 1033) varFont["name"].setName("Mplus1Code-Regular", 6, 3, 1, 1033) print("[MPLUS " + type + "] Saving") varFont.save(output / 
"Mplus1Code[wght].ttf") autohint(output / "Mplus1Code[wght].ttf") prefix = "Mplus1Code" generator = fontmake.instantiator.Instantiator.from_designspace(ds) pool = multiprocessing.pool.Pool(processes=multiprocessing.cpu_count()) processes = [] for instance_descriptor in ds.instances: # GOTTA GO FAST processes.append( pool.apply_async( make_static, (instance_descriptor, generator, prefix), )) pool.close() pool.join() for process in processes: process.get() del processes, pool
def filter(self, items: Iterable[Any]) -> Iterable[Any]: if len(self._filters) == 0: return items try: with Manager() as manager: stdout_queue = manager.Queue() #type: ignore stdlog_queue = manager.Queue() #type: ignore stdout_writer, stdout_reader = QueueSink( stdout_queue), QueueSource(stdout_queue) stdlog_writer, stdlog_reader = QueueSink( stdlog_queue), QueueSource(stdlog_queue) class MyPool(multiprocessing.pool.Pool): _missing_error_definition_error_is_new = True def _join_exited_workers(self): for worker in self._pool: if worker.exitcode == 1000 and MyPool._missing_error_definition_error_is_new: #this is a hack... This only works so long as we just #process one job at a time... This is true in our case. #this is necessary because multiprocessing can get stuck #waiting for failed workers and that is frustrating for users. MyPool._missing_error_definition_error_is_new = False message = ( "Coba attempted to evaluate your benchmark in multiple processes but the pickle module was unable to " "find all the definitions needed to pass the tasks to the processes. The two most common causes of " "this error are: 1) a learner or simulation is defined in a Jupyter Notebook cell or 2) a necessary " "class definition exists inside the `__name__=='__main__'` code block in the main execution script. In " "either case there are two simple solutions: 1) evalute your benchmark in a single processed with no " "limit on child tasks or 2) define all you classes in a separate python file that is imported when " "evaluating.") CobaConfig.Logger.log(message) if worker.exitcode is not None and worker.exitcode != 0: #A worker exited in an uncontrolled manner and was unable to clean its job #up. We therefore mark one of the jobs as "finished" but failed to prevent an #infinite wait on a failed job to finish that is actually no longer running. list(self._cache.values())[0]._set( None, (False, None)) return super()._join_exited_workers() with MyPool(self._processes, maxtasksperchild=self._maxtasksperchild) as pool: # handle not picklable (this is handled by done_or_failed) # handle empty list (this is done by checking result.ready()) # handle exceptions in process (unhandled exceptions can cause children to hang so we pass them to stderr) # handle ctrl-c without hanging # > don't call result.get when KeyboardInterrupt has been hit # > handle EOFError,BrokenPipeError errors with queue since ctr-c kills manager # handle AttributeErrors. These occure when... (this is handled by shadowing several pool methods) # > a class that is defined in a Jupyter Notebook cell is pickled # > a class that is defined inside the __name__=='__main__' block is pickeled # handle Benchmark.evaluate not being called inside of __name__=='__main__' (this is handled by a big try/catch) def done_or_failed(results_or_exception=None): #This method is called one time at the completion of map_async #in the case that one of our jobs threw an exception the argument #will contain an exception otherwise it will be the returned results #of all the jobs. This method is executed on a thread in the Main context. if isinstance(results_or_exception, Exception): from coba.config import CobaConfig if "Can't pickle" in str( results_or_exception) or "Pickling" in str( results_or_exception): message = ( str(results_or_exception) + ". Coba attempted to process your Benchmark on multiple processes and " "the named class was not able to be pickled. 
This problem can be fixed in one of two ways: 1) " "evaluate the benchmark in question on a single process with no limit on the tasks per child or 2) " "modify the named class to be picklable. The easiest way to make the given class picklable is to " "add `def __reduce__ (self) return (<the class in question>, (<tuple of constructor arguments>))` to " "the class. For more information see https://docs.python.org/3/library/pickle.html#object.__reduce__." ) CobaConfig.Logger.log(message) else: CobaConfig.Logger.log_exception( results_or_exception) stdout_writer.write([None]) stdlog_writer.write([None]) log_thread = Thread(target=Pipe.join( stdlog_reader, [], CobaConfig.Logger.sink).run) log_thread.daemon = True log_thread.start() processor = MultiprocessFilter.Processor( self._filters, stdout_writer, stdlog_writer, self._processes) result = pool.map_async(processor.process, items, callback=done_or_failed, error_callback=done_or_failed, chunksize=1) # When items is empty finished_callback will not be called and we'll get stuck waiting for the poison pill. # When items is empty ready() will be true immediately and this check will place the poison pill into the queues. if result.ready(): done_or_failed() try: for item in stdout_reader.read(): yield item pool.close() except (KeyboardInterrupt, Exception): try: pool.terminate() except: pass raise finally: pool.join() log_thread.join() except RuntimeError as e: #This happens when importing main causes this code to run again raise CobaFatal(str(e))
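# Greatly reduced sketch of the map_async pattern in MultiprocessFilter.filter
# above: the same function serves as callback and error_callback, runs once when
# the whole map finishes or fails, and places a sentinel (the "poison pill") on
# the output queue so the consuming loop knows when to stop. The manager queues,
# logging threads, and custom pool of the original are stripped away.
import multiprocessing


def process(item):
    return item * item


def consume(queue):
    while True:
        value = queue.get()
        if value is None:  # poison pill placed by done_or_failed
            break
        yield value


if __name__ == "__main__":
    manager = multiprocessing.Manager()
    out = manager.Queue()

    def done_or_failed(results_or_exception=None):
        if isinstance(results_or_exception, Exception):
            print("worker failed:", results_or_exception)
        elif results_or_exception is not None:
            for value in results_or_exception:
                out.put(value)
        out.put(None)  # sentinel: nothing more will arrive

    with multiprocessing.Pool(2) as pool:
        pool.map_async(process, [1, 2, 3], callback=done_or_failed,
                       error_callback=done_or_failed, chunksize=1)
        print(list(consume(out)))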
def preprocess_batch(tls_type='webster'): # Preprocess is a run with some presets # Read script arguments from run.config file. data = {} if do_preprocess(): num_processors, num_runs, seeds = config_parser.parse_run_params(print_params=False) if len(seeds) != num_runs: raise configparser.Error('Number of seeds in run.config `seeds`' ' must match the number of runs (`num_runs`) argument.') # Assess total number of processors. processors_total = mp.cpu_count() print(f'Total number of processors available: {processors_total}\n') # Adjust number of processors. if num_processors > processors_total: num_processors = processors_total print(f'Number of processors downgraded to {num_processors}\n') # num_processors should be <= num_runs seeds = seeds[:num_processors] print('Arguments (preprocess):') print('-----------------------') print(f'Number of runs: {num_runs}') print(f'Number of processors: {num_processors}') print(f'Number of preprocess trails: {num_processors}\n') # Read train.py arguments from train.config file. preprocess_config = configparser.ConfigParser() preprocess_path = CONFIG_PATH / 'train.config' preprocess_config.read(str(preprocess_path)) # Setup sumo-tls-type. preprocess_config.set('train_args', 'tls_type', tls_type) preprocess_config.set('train_args', 'experiment_save_agent', str(False)) preprocess_config.set('mdp_args', 'discretize_state_space', str(False)) # Get feature & network information network = preprocess_config.get('train_args', 'network') features = eval(preprocess_config.get('mdp_args', 'features')) if eval(preprocess_config.get('mdp_args', 'time_period')) is not None: features = ('time',) + features # Remove lag from features features = tuple(rmlag(f) for f in features) # Override train configurations with test parameters. test_config = configparser.ConfigParser() test_path = CONFIG_PATH / 'test.config' test_config.read(test_path.as_posix()) horizon = int(test_config.get('test_args', 'rollout-time')) preprocess_config.set('train_args', 'experiment_time', str(horizon)) # Write .xml files for test plots creation. preprocess_config.set('train_args', 'sumo_emission', str(False)) timestamp = datetime.now().strftime('%Y%m%d%H%M%S.%f') print(f'Experiment timestamp: {timestamp}') with tempfile.TemporaryDirectory() as tmp_dir: # Create a config file for each train.py # with the respective seed. These config # files are stored in a temporary directory. tmp_path = Path(tmp_dir) preprocess_configs = [] for seed in seeds: cfg_path = tmp_path / f'{tls_type}-{seed}.config' preprocess_configs.append(cfg_path) # Setup train seed. preprocess_config.set("train_args", "experiment_seed", str(seed + 1)) # Write temporary train config file. with cfg_path.open('w') as ft: preprocess_config.write(ft) # rvs: directories' names holding experiment data if num_processors > 1: ind = range(num_processors) cfgs = preprocess_configs packed_args = zip(ind, cfgs) pool = NonDaemonicPool(num_processors) rvs = pool.map(delay_preprocess, packed_args) pool.close() pool.join() else: rvs = [] for cfg in preprocess_configs: rvs.append(delay_preprocess((0.0, cfg))) data = defaultdict(list) for ret in rvs: data[(network, features)] += ret['observation_spaces'] data = digitize2(data) return data
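# Hedged sketch of the per-seed fan-out in preprocess_batch above: each run gets
# its own config, the configs are zipped with an index, and a pool maps a worker
# over the packed tuples, with a serial fallback when only one processor is
# available. `run_one` is a placeholder for delay_preprocess; the original uses a
# non-daemonic pool because that worker launches further processes itself.
from multiprocessing import Pool


def run_one(packed):
    index, cfg_path = packed
    return {"worker": index, "config": str(cfg_path)}


if __name__ == "__main__":
    configs = ["webster-1.config", "webster-2.config"]
    packed_args = list(zip(range(len(configs)), configs))
    if len(configs) > 1:
        with Pool(len(configs)) as pool:
            rvs = pool.map(run_one, packed_args)
    else:
        rvs = [run_one(args) for args in packed_args]
    print(rvs)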
def Search(self, date, table='events', coverage=False, translation=False, output=None, queryTime=datetime.datetime.now().strftime('%m-%d-%Y %H:%M:%S'), normcols=False): """Core searcher method to set parameters for GDELT data searches Keyword arguments ---------- date : str, required The string representation of a datetime (single) or date range (list of strings) that is (are) the targeted timelines to pull GDELT data. table : string,{'events','gkg','mentions'} Select from the table formats offered by the GDELT service: * events (1.0 and 2.0) The biggest difference between 1.0 and 2.0 are the update frequencies. 1.0 data is disseminated daily, and the most recent data will be published at 6AM Eastern Standard time of the next day. So, 21 August 2016 results would be available 22 August 2016 at 6AM EST. 2.0 data updates every 15 minutes of the current day. Version 1.0 runs from January 1, 1979 through March 31, 2013 contains 57 fields for each record. The Daily Updates collection, which begins April 1, 2013 and runs through present, contains an additional field at the end of each record, for a total of 58 fields for each record. The format is dyadic CAMEO format, capturing two actors and the action performed by Actor1 upon Actor2. Version 2.0 only covers February 19, 2015 onwards, and is stored in an expanded version of the dyadic CAMEO format . See http://data.gdeltproject.org/documentation/GDELT-Event_ Codebook-V2.0.pdf for more information. * gkg (1.0 and 2.0) **Warning** These tables and queries can be extremely large and consume a lot of RAM. Consider running a single days worth of gkg pulls, store to disc, flush RAM, then proceed to the next day. Table that represents all of the latent dimensions, geography, and network structure of the global news. It applies an array of highly sophisticated natural language processing algorithms to each document to compute a range of codified metadata encoding key latent and contextual dimensions of the document. Version 2.0 includes Global Content Analysis Measures (GCAM) which reportedly provides 24 emotional measurement packages that assess more than 2,300 emotions and themes from every article in realtime, multilingual dimensions natively assessing the emotions of 15 languages (Arabic, Basque, Catalan, Chinese, French, Galician, German, Hindi, Indonesian, Korean, Pashto, Portuguese, Russian, Spanish, and Urdu).See documentation about GKG 1.0 at http://data.gdeltproject.org/documentation/GDELT- Global_Knowledge_Graph_Codebook.pdf, and GKG 2.0 at http:// data.gdeltproject.org/documentation/GDELT-Global_Knowledge_ Graph_Codebook-V2.1.pdf. * mentions (2.0 only) Mentions table records every mention of an event over time, along with the timestamp the article was published. This allows the progression of an event through the global media to be tracked, identifying outlets that tend to break certain kinds of events the earliest or which may break stories later but are more accurate in their reporting on those events. Combined with the 15 minute update resolution and GCAM, this also allows the emotional reaction and resonance of an event to be assessed as it sweeps through the world’s media. coverage : bool, default: False When set to 'True' and the GDELT version parameter is set to 2, gdeltPyR will pull back every 15 minute interval in the day ( full results) or, if pulling for the current day, pull all 15 minute intervals up to the most recent 15 minute interval of the current our. 
For example, if the current date is 22 August, 2016 and the current time is 0828 HRs Eastern, our pull would get pull every 15 minute interval in the day up to 0815HRs. When coverate is set to true and a date range is entered, we pull every 15 minute interval for historical days and up to the most recent 15 minute interval for the current day, if that day is included. translation : bool, default: False Whether or not to pull the translation database available from version 2 of GDELT. If translation is True, the translated set is downloaded, if set to False the english set is downloaded. queryTime : datetime object, system generated This records the system time when gdeltPyR's query was executed, which can be used for logging purposes. output : string, {None,'df','gpd','shp','shapefile', 'json', 'geojson' 'r','geodataframe'} Select the output format for the returned GDELT data Options ------- json - Javascript Object Notation output; returns list of dictionaries in Python or a list of json objects r - writes the cross language dataframe to the current directory. This uses the Feather library found at https://github.com/wesm/ feather. This option returns a pandas dataframe but write the R dataframe to the current working directory. The filename includes all the parameters used to launch the query: version, coverage, table name, query dates, and query time. csv- Outputs a CSV format; all dates and columns are joined shp- Writes an ESRI shapefile to current directory or path; output is filtered to exclude rows with no latitude or longitude geojson- geodataframe- Returns a geodataframe; output is filtered to exclude rows with no latitude or longitude. This output can be manipulated for geoprocessing/geospatial operations such as reprojecting the coordinates, creating a thematic map (choropleth map), merging with other geospatial objects, etc. See http://geopandas.org/ for info. normcols : bool Applies a generic lambda function to normalize GDELT columns for compatibility with SQL or Shapefile outputs. Examples -------- >>> from gdelt >>> gd = gdelt.gdelt(version=1) >>> results = gd.Search(['2016 10 19'],table='events',coverage=True) >>> print(len(results)) 244767 >>> gd = gdelt.gdelt(version=2) >>> results = gd.Search(['2016 Oct 10'], table='gkg') >>> print(len(results)) 2398 >>> print(results.V2Persons.ix[2]) Juanita Broaddrick,1202;Monica Lewinsky,1612;Donald Trump,12;Donald Trump,244;Wolf Blitzer,1728;Lucianne Goldberg,3712;Linda Tripp,3692; Bill Clinton,47;Bill Clinton,382;Bill Clinton,563;Bill Clinton,657;Bill Clinton,730;Bill Clinton,1280;Bill Clinton,2896;Bill Clinton,3259;Bill Clinton,4142;Bill Clinton,4176;Bill Clinton,4342;Ken Starr,2352;Ken Starr,2621;Howard Stern,626;Howard Stern,4286;Robin Quivers,4622; Paula Jones,3187;Paula Jones,3808;Gennifer Flowers,1594;Neil Cavuto, 3362;Alicia Machado,1700;Hillary Clinton,294;Hillary Clinton,538; Hillary Clinton,808;Hillary Clinton,1802;Hillary Clinton,2303;Hillary Clinton,4226 >>> results = gd.Search(['2016 Oct 10'], table='gkg',output='r') Notes ------ Read more about GDELT data at http://gdeltproject.org/data.html gdeltPyR retrieves Global Database of Events, Language, and Tone (GDELT) data (version 1.0 or version 2.0) via parallel HTTP GET requests and is an alternative to accessing GDELT data via Google BigQuery. Performance will vary based on the number of available cores (i.e. CPUs), internet connection speed, and available RAM. 
For systems with limited RAM, Later iterations of gdeltPyR will include an option to store the output directly to disc. """ # check for valid table names; fail early valid = ['events', 'gkg', 'vgkg', 'iatv', 'mentions'] if table not in valid: raise ValueError( 'You entered "{}"; this is not a valid table name.' ' Choose from "events", "mentions", or "gkg".'.format(table)) _date_input_check(date, self.version) self.coverage = coverage self.date = date version = self.version baseUrl = self.baseUrl self.queryTime = queryTime self.table = table self.translation = translation self.datesString = _gdeltRangeString(_dateRanger(self.date), version=version, coverage=self.coverage) ################################# # R dataframe check; fail early ################################# if output == 'r': # pragma: no cover try: import feather except ImportError: raise ImportError(('You need to install `feather` in order ' 'to output data as an R dataframe. Keep ' 'in mind the function will return a ' 'pandas dataframe but write the R ' 'dataframe to your current working ' 'directory as a `.feather` file. Install ' 'by running\npip install feather\nor if ' 'you have Anaconda (preferred)\nconda ' 'install feather-format -c conda-forge\nTo ' 'learn more about the library visit https:/' '/github.com/wesm/feather')) ################################## # Partial Functions ################################# v1RangerCoverage = partial(_gdeltRangeString, version=1, coverage=True) v2RangerCoverage = partial(_gdeltRangeString, version=2, coverage=True) v1RangerNoCoverage = partial(_gdeltRangeString, version=1, coverage=False) v2RangerNoCoverage = partial(_gdeltRangeString, version=2, coverage=False) urlsv1gkg = partial(_urlBuilder, version=1, table='gkg') urlsv2mentions = partial(_urlBuilder, version=2, table='mentions', translation=self.translation) urlsv2events = partial(_urlBuilder, version=2, table='events', translation=self.translation) urlsv1events = partial(_urlBuilder, version=1, table='events') urlsv2gkg = partial(_urlBuilder, version=2, table='gkg', translation=self.translation) eventWork = partial(_mp_worker, table='events', proxies=self.proxies) codeCams = partial(_cameos, codes=codes) ##################################### # GDELT Version 2.0 Headers ##################################### if int(self.version) == 2: ################################### # Download 2.0 Headers ################################### if self.table == 'events': try: self.events_columns = \ pd.read_csv(os.path.join(BASE_DIR, "data", 'events2.csv'))[ 'name'].values.tolist() except: # pragma: no cover self.events_columns = _events2Heads() elif self.table == 'mentions': try: self.mentions_columns = \ pd.read_csv( os.path.join(BASE_DIR, "data", 'mentions.csv'))[ 'name'].values.tolist() except: # pragma: no cover self.mentions_columns = _mentionsHeads() else: try: self.gkg_columns = \ pd.read_csv( os.path.join(BASE_DIR, "data", 'gkg2.csv'))[ 'name'].values.tolist() except: # pragma: no cover self.gkg_columns = _gkgHeads() ##################################### # GDELT Version 1.0 Analytics, Header, Downloads ##################################### if int(self.version) == 1: if self.table is "mentions": raise ValueError('GDELT 1.0 does not have the "mentions"' ' table. Specify the "events" or "gkg"' 'table.') if self.translation: raise ValueError('GDELT 1.0 does not have an option to' ' return translated table data. 
Switch to ' 'version 2 by reinstantiating the gdelt ' 'object with <gd = gdelt.gdelt(version=2)>') else: pass try: self.events_columns = \ pd.read_csv(os.path.join(BASE_DIR, "data", 'events1.csv'))[ 'name'].values.tolist() except: # pragma: no cover self.events_columns = _events1Heads() columns = self.events_columns if self.table == 'gkg': self.download_list = (urlsv1gkg( v1RangerCoverage(_dateRanger(self.date)))) elif self.table == 'events' or self.table == '': if self.coverage is True: # pragma: no cover self.download_list = (urlsv1events( v1RangerCoverage(_dateRanger(self.date)))) else: # print("I'm here at line 125") self.download_list = (urlsv1events( v1RangerNoCoverage(_dateRanger(self.date)))) else: # pragma: no cover raise Exception('You entered an incorrect table type for ' 'GDELT 1.0.') ##################################### # GDELT Version 2.0 Analytics and Download ##################################### elif self.version == 2: if self.table == 'events' or self.table == '': columns = self.events_columns if self.coverage is True: # pragma: no cover self.download_list = (urlsv2events( v2RangerCoverage(_dateRanger(self.date)))) else: self.download_list = (urlsv2events( v2RangerNoCoverage(_dateRanger(self.date)))) if self.table == 'gkg': columns = self.gkg_columns if self.coverage is True: # pragma: no cover self.download_list = (urlsv2gkg( v2RangerCoverage(_dateRanger(self.date)))) else: self.download_list = (urlsv2gkg( v2RangerNoCoverage(_dateRanger(self.date)))) # print ("2 gkg", urlsv2gkg(self.datesString)) if self.table == 'mentions': columns = self.mentions_columns if self.coverage is True: # pragma: no cover self.download_list = (urlsv2mentions( v2RangerCoverage(_dateRanger(self.date)))) else: self.download_list = (urlsv2mentions( v2RangerNoCoverage(_dateRanger(self.date)))) ######################### # DEBUG Print Section ######################### # if isinstance(self.datesString,str): # if parse(self.datesString) < datetime.datetime.now(): # self.datesString = (self.datesString[:8]+"234500") # elif isinstance(self.datesString,list): # print("it's a list") # elif isinstance(self.datesString,np.ndarray): # print("it's an array") # else: # print("don't know what it is") # print (self.version,self.download_list,self.date, self.table, self.coverage, self.datesString) # # print (self.download_list) # if self.coverage: # coverage = 'True' # else: # coverage = 'False' # if isinstance(self.date, list): # # formattedDates = ["".join(re.split(' |-|;|:', l)) for l in # self.date] # path = formattedDates # print("gdeltVersion_" + str(self.version) + # "_coverage_" + coverage + "_" + # "_table_" + self.table + '_queryDates_' + # "_".join(path) + # "_queryTime_" + # datetime.datetime.now().strftime('%m-%d-%YT%H%M%S')) # else: # print("gdeltVersion_" + str(self.version) + # "_coverage_" + coverage + "_" + # "_table_" + self.table + '_queryDates_' + # "".join(re.split(' |-|;|:', self.date)) + # "_queryTime_" + # datetime.datetime.now().strftime('%m-%d-%YT%H%M%S')) ######################### # Download section ######################### # print(self.download_list,type(self.download_list)) # from gdelt.extractors import normalpull # e=ProcessPoolExecutor() # if isinstance(self.download_list,list) and len(self.download_list)==1: # from gdelt.extractors import normalpull # # results=normalpull(self.download_list[0],table=self.table) # elif isinstance(self.download_list,list): # print(table) # multilist = list(e.map(normalpull,self.download_list)) # results = pd.concat(multilist) # 
print(results.head()) if isinstance(self.datesString, str): if self.table == 'events': results = eventWork(self.download_list) else: # if self.table =='gkg': # results = eventWork(self.download_list) # # else: results = _mp_worker(self.download_list, proxies=self.proxies) else: if self.table == 'events': pool = Pool(processes=cpu_count()) downloaded_dfs = list( pool.imap_unordered(eventWork, self.download_list)) else: pool = NoDaemonProcessPool(processes=cpu_count()) downloaded_dfs = list( pool.imap_unordered( _mp_worker, self.download_list, )) pool.close() pool.terminate() pool.join() # print(downloaded_dfs) results = pd.concat(downloaded_dfs) del downloaded_dfs results.reset_index(drop=True, inplace=True) if self.table == 'gkg' and self.version == 1: results.columns = results.ix[0].values.tolist() results.drop([0], inplace=True) columns = results.columns # check for empty dataframe if results is not None: if len(results.columns) == 57: # pragma: no cover results.columns = columns[:-1] else: results.columns = columns # if dataframe is empty, raise error elif results is None or len(results) == 0: # pragma: no cover raise ValueError("This GDELT query returned no data. Check " "query parameters and " "retry") # Add column of human readable codes; need updated CAMEO if self.table == 'events': cameoDescripts = results.EventCode.apply(codeCams) results.insert(27, 'CAMEOCodeDescription', value=cameoDescripts.values) ############################################### # Setting the output options ############################################### # dataframe output if output == 'df': self.final = results # json output elif output == 'json': self.final = results.to_json(orient='records') # csv output elif output == 'csv': self.final = results.to_csv(encoding='utf-8') # geopandas dataframe output elif output == 'gpd' or output == 'geodataframe' or output == 'geoframe': self.final = _geofilter(results) self.final = self.final[self.final.geometry.notnull()] # r dataframe output elif output == 'r': # pragma: no cover if self.coverage: coverage = 'True' else: coverage = 'False' if isinstance(self.date, list): formattedDates = [ "".join(re.split(' |-|;|:', l)) for l in self.date ] path = formattedDates outPath = ( "gdeltVersion_" + str(self.version) + "_coverage_" + coverage + "_" + "_table_" + self.table + '_queryDates_' + "_".join(path) + "_queryTime_" + datetime.datetime.now().strftime('%m-%d-%YT%H%M%S') + ".feather") else: outPath = ( "gdeltVersion_" + str(self.version) + "_coverage_" + coverage + "_" + "_table_" + self.table + '_queryDates_' + "".join(re.split(' |-|;|:', self.date)) + "_queryTime_" + datetime.datetime.now().strftime('%m-%d-%YT%H%M%S') + ".feather") if normcols: results.columns = list( map(lambda x: (x.replace('_', "")).lower(), results.columns)) feather.api.write_dataframe(results, outPath) return results else: self.final = results ######################### # Return the result ######################### # normalized columns if normcols: self.final.columns = list( map(lambda x: (x.replace('_', "")).lower(), self.final.columns)) return self.final
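# Reduced sketch of the download stage of Search above: imap_unordered hands the
# next URL to whichever worker is free, the per-URL results are concatenated once
# all downloads finish (pd.concat in the original; a flat list here), and the
# pool is closed and joined before the combined result is used. `fetch` is a
# stand-in for the _mp_worker/eventWork download functions.
from multiprocessing import Pool, cpu_count


def fetch(url):
    return [(url, len(url))]  # stand-in for a parsed dataframe of records


if __name__ == "__main__":
    download_list = ["http://example.com/a.zip", "http://example.com/bb.zip"]
    with Pool(processes=min(cpu_count(), len(download_list))) as pool:
        frames = list(pool.imap_unordered(fetch, download_list))
    results = [row for frame in frames for row in frame]  # cheap "concat"
    print(results)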
    pool = multiprocessing.pool.Pool(processes=multiprocessing.cpu_count())
    processes = []
    if args.decol:
        processes.append(pool.apply_async(execute, ("decol", sources)))
    if args.haruno:
        processes.append(pool.apply_async(execute, ("haruno", sources)))
    if args.opti:
        processes.append(pool.apply_async(execute, ("opti", sources)))
    if args.tokumin:
        processes.append(pool.apply_async(execute, ("tokumin", sources)))
    pool.close()
    pool.join()
    for process in processes:
        process.get()
    del processes, pool
elif args.shared:
    if os.path.isfile(sources / "Kaisei-Shared.glyphs"):
        main(("glyphs2ufo", str(sources / "Kaisei-Shared.glyphs"), "-m",
              str(sources / "ufo_shared")))
    else:
        print("Cannot locate the 'shared' Glyphs file. Please confirm the file is unzipped.")
else:
    print("No fonts selected for export")
def _run_next_virtual_nodes(graph, node, globals_, locals_, flags, pool, result): operator = graph.node[node].get('OPERATOR', None) return_value = [] not_safe_to_iter = False is_head_result = True head_result = None # "Hello, world" or {...} if isinstance(result, (basestring, dict)) or not __isiter(result): not_safe_to_iter = True # [[1]] if isinstance(result, list) and len(result) == 1 and isinstance( result[0], list): result = result[0] not_safe_to_iter = True # More nodes ahead? if operator: if not_safe_to_iter: logging.debug('not_safe_to_iter is True for %s' % result) head_result = result tmp_globals = copy.copy(globals_) tmp_locals = copy.copy(locals_) tmp_globals['_'] = tmp_locals['_'] = head_result return_value = __resolve_and_merge_results( _run(graph, node, tmp_globals, tmp_locals, {}, None, True)) else: # Originally this was implemented using result[0] and result[1:] but xrange() is not slice-able, thus, I have changed it to `for` with buffer for 1st result for res_value in result: logging.debug('Now at %s from %s' % (res_value, result)) if is_head_result: logging.debug('is_head_result is True for %s' % res_value) is_head_result = False head_result = res_value tmp_globals = copy.copy(globals_) tmp_locals = copy.copy(locals_) tmp_globals['_'] = tmp_locals['_'] = head_result return_value.insert( 0, _run(graph, node, tmp_globals, tmp_locals, {}, None, True)) continue tmp_globals = copy.copy(globals_) tmp_locals = copy.copy(locals_) tmp_globals['_'] = tmp_locals['_'] = res_value # Synchronous if operator == '|': return_value.append( pool.apply(_run, args=(graph, node, tmp_globals, tmp_locals, {}, None, True))) # Asynchronous if operator == '->': return_value.append( pool.apply_async(_run, args=(graph, node, tmp_globals, tmp_locals, {}, None, True))) pool.close() pool.join() pool.terminate() logging.debug('return_value = %s' % return_value) return_value = __resolve_and_merge_results(return_value) # Loopback else: # AS IS if not_safe_to_iter: return_value = [result] # Iterate for all possible *return values* else: for res_value in result: return_value.append(res_value) # Unbox if len(return_value) == 1: return_value = return_value[0] return return_value
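# Sketch of the two dispatch modes in _run_next_virtual_nodes above: pool.apply
# blocks until the child finishes (the synchronous '|' operator), while
# pool.apply_async returns a handle immediately (the asynchronous '->' operator)
# that is resolved when the results are merged. `step` is an illustrative worker.
from multiprocessing import Pool


def step(value):
    return value + 1


if __name__ == "__main__":
    with Pool(2) as pool:
        sync_result = pool.apply(step, (1,))         # blocks, returns 2
        async_handle = pool.apply_async(step, (2,))  # returns immediately
        results = [sync_result, async_handle.get()]  # resolve the async handle later
    print(results)  # [2, 3]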
def findDEMFeature(original_dem, index): global featureList,maxArea,xy,areaList,indexList,maskBB,maskHeight,maskLabel,maskStd,\ neighbours,regionbb,mask,regionval,smallerThan,kernel height, width = original_dem.shape region = regionprops(index, original_dem, cache=True) number_regions = len(region) for i in range(0, number_regions): if region[i].area > 10000: areaList.append(region[i].area) indexList.append(i) maskBB.append(region[i].bbox) maskLabel.append(region[i].label) maskHeight.append(region[i].mean_intensity) xy = region[i].coords std = np.std(original_dem[xy[:, 0], xy[:, 1]]) maskStd.append(std) areaList = np.array(areaList) indexList = np.array(indexList) maskBB = np.array(maskBB) maskHeight = np.array(maskHeight) maskLabel = np.array(maskLabel) maskStd = np.array(maskStd) order = np.argsort(-areaList) #minus for decending areaList = areaList[order] indexList = indexList[order] maskBB = maskBB[order] maskHeight = maskHeight[order] maskLabel = maskLabel[order] maskStd = maskStd[order] for regionIndex in range(0, int(len(areaList) / 10)): minr, minc, maxr, maxc = maskBB[regionIndex] extraMargin = 20 if minr - extraMargin < 0: minr = 0 else: minr = minr - extraMargin if minc - extraMargin < 0: minc = 0 else: minc = minc - extraMargin if maxr + extraMargin > height: maxr = height else: maxr = maxr + extraMargin if maxc + extraMargin > width: maxc = width else: maxc = maxc + extraMargin regionbb = index[minr:maxr, minc:maxc] mask = (regionbb == maskLabel[regionIndex]).astype(np.uint8) * 255 contours = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[1] holeData = [] if len(contours) - 1 > 0: for j in range(0, len(contours) - 1): cnt = contours[j + 1] pos = cnt[0] area = cv2.contourArea(cnt) if area > 1000: holeData.append(cv2.contourArea(contours[j + 1])) if len(holeData) > 0: number_holes = len(holeData) holeData = np.sort(holeData) avgHole = np.mean(holeData, dtype=np.int) else: number_holes = avgHole = largestHole = 0 else: number_holes = avgHole = largestHole = 0 cnt = contours[0] hull = cv2.convexHull(cnt, returnPoints=False) defects = cv2.convexityDefects(cnt, hull) defectData = [] if defects is not None: total_number_defects = len(defects) for i in range(defects.shape[0]): d = defects[i, 0][3] if d > 100000: defectData.append(d) if len(defectData) > 0: number_defects = len(defectData) defectData = np.sort(defectData) avgDefect = np.mean(defectData, dtype=np.int) else: number_defects = avgDefect = 0 else: number_defects = avgDefect = 0 total_number_defects = 0 mask2 = cv2.dilate(mask, kernel, iterations=1) > 0 regionbb = np.multiply(mask2, regionbb) neighbours = np.unique(regionbb) if neighbours[0] == 0: neighbours = neighbours[1:-1] removePos = np.where(neighbours == maskLabel[regionIndex]) neighbours = np.delete(neighbours, removePos) neighbours = np.intersect1d( neighbours, maskLabel) #to take only large area adjacent segments smallerThan = 0 if maskStd[regionIndex] > 2: regionval = original_dem[minr:maxr, minc:maxc] pool = ThreadPool(int(cpu_count())) pool.map(findStdSmaller, range(0, len(neighbours))) pool.close() pool.join() else: for i in range(0, len(indexList)): for j in range(0, len(neighbours)): if maskLabel[i] == neighbours[j]: if maskHeight[i] > maskHeight[regionIndex] + 2: smallerThan = smallerThan + 1 featureList.append([maskLabel[regionIndex],number_holes,avgHole,\ number_defects,total_number_defects,smallerThan,int(np.ceil(maskStd[regionIndex]))]) return np.array(featureList), region
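# Sketch of why findDEMFeature above uses a ThreadPool rather than a process
# pool: the workers only read module-level arrays and bump a shared counter, so
# threads (which share memory) avoid pickling large rasters. `heights`,
# `reference` and `compare` are illustrative stand-ins for the globals read by
# findStdSmaller; the lock is an extra safeguard, not part of the original.
import threading
from multiprocessing import cpu_count
from multiprocessing.pool import ThreadPool

heights = [10.0, 12.5, 9.0, 15.0]
reference = 11.0
count_smaller = 0
lock = threading.Lock()


def compare(i):
    global count_smaller
    if heights[i] > reference + 2:
        with lock:  # protect the shared counter across threads
            count_smaller += 1


if __name__ == "__main__":
    pool = ThreadPool(int(cpu_count()))
    pool.map(compare, range(len(heights)))
    pool.close()
    pool.join()
    print(count_smaller)  # 1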
def eval(source, globals_={}, locals_={}): """Evaluate Pythonect code in the context of globals and locals. Args: source: A string representing a Pythonect code or a networkx.DiGraph() as returned by parse() globals: A dictionary. locals: Any mapping. Returns: The return value is the result of the evaluated code. Raises: SyntaxError: An error occurred parsing the code. """ return_value = None # Meaningful program? if source != "pass": logging.info('Program is meaningful') return_value = [] return_values = [] globals_values = [] locals_values = [] tasks = [] reduces = {} logging.debug('Evaluating %s with globals_ = %s and locals_ %s' % (source, globals_, locals_)) if not isinstance(source, networkx.DiGraph): logging.info('Parsing program...') graph = parse(source) else: logging.info('Program is already parsed! Using source AS IS') graph = source root_nodes = sorted([ node for node, degree in graph.in_degree().items() if degree == 0 ]) if not root_nodes: cycles = networkx.simple_cycles(graph) if cycles: logging.info( 'Found cycles: %s in graph, using nodes() 1st node (i.e. %s) as root node' % (cycles, graph.nodes()[0])) root_nodes = [graph.nodes()[0]] logging.info('There are %d root node(s)' % len(root_nodes)) logging.debug('Root node(s) are: %s' % root_nodes) # Extend Python's __builtin__ with Pythonect's `lang` start_globals_ = __extend_builtins(globals_) logging.debug('Initial globals_:\n%s' % pprint.pformat(start_globals_)) # Default input start_globals_['_'] = start_globals_.get('_', locals_.get('_', None)) logging.info('_ equal %s', start_globals_['_']) # Execute Pythonect program pool = __create_pool(globals_, locals_) # N-1 for root_node in root_nodes[1:]: if globals_.get('__IN_EVAL__', None) is None and not _is_referencing_underscore( graph, root_node): # Reset '_' globals_['_'] = locals_['_'] = None if globals_.get('__IN_EVAL__', None) is None: globals_['__IN_EVAL__'] = True temp_globals_ = copy.copy(globals_) temp_locals_ = copy.copy(locals_) task_result = pool.apply_async(_run, args=(graph, root_node, temp_globals_, temp_locals_, {}, None, False)) tasks.append((task_result, temp_locals_, temp_globals_)) # 1 if globals_.get('__IN_EVAL__', None) is None and not _is_referencing_underscore( graph, root_nodes[0]): # Reset '_' globals_['_'] = locals_['_'] = None if globals_.get('__IN_EVAL__', None) is None: globals_['__IN_EVAL__'] = True result = _run(graph, root_nodes[0], globals_, locals_, {}, None, False) # 1 for expr_return_value in result: globals_values.append(globals_) locals_values.append(locals_) return_values.append([expr_return_value]) # N-1 for (task_result, task_locals_, task_globals_) in tasks: return_values.append(task_result.get()) locals_values.append(task_locals_) globals_values.append(task_globals_) # Reduce + _PythonectResult Grouping for item in return_values: # Is there _PythonectResult in item list? for sub_item in item: if isinstance(sub_item, _PythonectResult): # 1st Time? if sub_item.values['node'] not in reduces: reduces[sub_item.values['node']] = [] # Add Place holder to mark the position in the return value list return_value.append( _PythonectLazyRunner(sub_item.values['node'])) reduces[sub_item.values['node']] = reduces[ sub_item.values['node']] + [sub_item.values] else: return_value.append(sub_item) # Any _PythonectLazyRunner's? 
if reduces: for return_item_idx in xrange(0, len(return_value)): if isinstance(return_value[return_item_idx], _PythonectLazyRunner): # Swap list[X] with list[X.go(reduces)] return_value[return_item_idx] = pool.apply_async( return_value[return_item_idx].go, args=(graph, reduces)) return_value = __resolve_and_merge_results(return_value) # [...] ? if return_value: # Single return value? (e.g. [1]) if len(return_value) == 1: return_value = return_value[0] # Update globals_ and locals_ # globals_, locals_ = __merge_all_globals_and_locals(globals_, locals_, globals_values, {}, locals_values, {}) # Set `return value` as `_` globals_['_'] = locals_['_'] = return_value if globals_.get('__IN_EVAL__', None) is not None: del globals_['__IN_EVAL__'] pool.close() pool.join() pool.terminate() return return_value
def processFile(filename): ex = detectExtension(filename) print("\nStarted processing ", filename[:-len(ex)], " ...") workImage = Image.open(("images/" + filename)) width, height = workImage.size preview = Image.new('RGB', (width, height), color='red') if (dithering): print("Started dithering for " + filename) for i in range(width): for j in range(height): oldPixel = getPixel([i, j], workImage) newPixel = getClosestColor(oldPixel, colors) setPixel([i, j], workImage, getClosestColor(newPixel, colors)) quant_error = np.array(oldPixel) - np.array(newPixel) if (i < width - 1): setPixel([i + 1, j], workImage, (np.array(getPixel([i + 1, j], workImage)) + (quant_error * (7 / 16))).astype(int)) if (i > 0 and j < height - 1): setPixel([i - 1, j + 1], workImage, (np.array(getPixel([i - 1, j + 1], workImage)) + (quant_error * (3 / 16))).astype(int)) if (j < height - 1): setPixel([i, j + 1], workImage, (np.array(getPixel([i, j + 1], workImage)) + (quant_error * (5 / 16))).astype(int)) if (i < width - 1 and j < height - 1): setPixel([i + 1, j + 1], workImage, (np.array(getPixel([i + 1, j + 1], workImage)) + (quant_error * (1 / 16))).astype(int)) print("Finished dithering for " + filename) inData = [] seg = width // 6 for n in range(6): inData.append([n, [seg * (n), seg * (n + 1)], workImage, height]) if (seg * (n + 1) < width and n == 5): inData.append([n, [seg * (n + 1), width], workImage, height]) print("Started assembling function for " + filename) if __name__ == '__main__': pool = nPool(processThreads) output = pool.map(generateCommandItems, inData) pool.close() pool.join() output.sort() if (isGif): command = '{"function":"set_nbt","tag":"{\\"Items\\":[' items = "" for e in output: items += e[1] command += items + '],gbundle:\\"start\\"}"}' file1 = open( "item_modifiers/" + (''.join([char for char in filename[:-len(ex)] if char != '\\' ])).lower() + ".json", "w") file1.write(command) file1.close() else: items = "" for e in output: items += e[1] command = "give @p bundle{Items:[" + (''.join( [char for char in items[:-1] if char != '\\'])) + "]}" file1 = open( "functions/" + (''.join([char for char in filename[:-len(ex)] if char != '\\' ])).lower() + ".mcfunction", "w") file1.write(command) file1.close() print(filename + " command has been generated and saved! Size: " + str(len(command))) return 1
def __init__(self, directory, image_data_generator, triplet_path,
             target_size=(256, 256), color_mode='rgb',
             classes=None, class_mode='categorical',
             batch_size=32, shuffle=True, seed=None,
             data_format=None,
             save_to_dir=None, save_prefix='', save_format='png',
             follow_links=False):
    if data_format is None:
        data_format = K.image_data_format()
    self.directory = directory
    self.image_data_generator = image_data_generator
    self.target_size = tuple(target_size)
    if color_mode not in {'rgb', 'grayscale'}:
        raise ValueError('Invalid color mode:', color_mode,
                         '; expected "rgb" or "grayscale".')
    self.color_mode = color_mode
    self.data_format = data_format
    if self.color_mode == 'rgb':
        if self.data_format == 'channels_last':
            self.image_shape = self.target_size + (3,)
        else:
            self.image_shape = (3,) + self.target_size
    else:
        if self.data_format == 'channels_last':
            self.image_shape = self.target_size + (1,)
        else:
            self.image_shape = (1,) + self.target_size
    self.classes = classes
    if class_mode not in {'categorical', 'binary', 'sparse', 'input', None}:
        raise ValueError('Invalid class_mode:', class_mode,
                         '; expected one of "categorical", '
                         '"binary", "sparse", "input" or None.')
    self.class_mode = class_mode
    self.save_to_dir = save_to_dir
    self.save_prefix = save_prefix
    self.save_format = save_format

    white_list_formats = {'png', 'jpg', 'jpeg', 'bmp', 'ppm'}

    # First, count the number of samples and classes.
    self.samples = 0
    if not classes:
        classes = []
        for subdir in sorted(os.listdir(directory)):
            if os.path.isdir(os.path.join(directory, subdir)):
                classes.append(subdir)
    self.num_class = len(classes)
    self.class_indices = dict(zip(classes, range(len(classes))))

    pool = multiprocessing.pool.ThreadPool()
    function_partial = partial(_count_valid_files_in_directory,
                               white_list_formats=white_list_formats,
                               follow_links=follow_links)
    self.samples = sum(pool.map(function_partial,
                                (os.path.join(directory, subdir)
                                 for subdir in classes)))

    print('Found %d images belonging to %d classes.' %
          (self.samples, self.num_class))

    # Second, build an index of the images in the different class subfolders.
    results = []
    self.filenames = []
    self.classes = np.zeros((batch_size,), dtype='int32')
    i = 0
    for dirpath in (os.path.join(directory, subdir) for subdir in classes):
        # result = _list_valid_filenames_in_directory(dirpath, white_list_formats, self.class_indices, follow_links, triplet_path)
        # results.append(result)
        results.append(pool.apply_async(_list_valid_filenames_in_directory,
                                        (dirpath, white_list_formats,
                                         self.class_indices, follow_links,
                                         triplet_path)))
    for res in results:
        classes, filenames = res.get()
        # self.classes = np.zeros((len(filenames),), dtype='int32')
        # self.classes[i:i + len(classes)] = classes
        self.filenames += filenames
        i += len(classes)
    pool.close()
    pool.join()
    super(DirectoryIterator, self).__init__(self.samples, batch_size,
                                            shuffle, seed, triplet_path)
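# Standalone sketch of the counting step used in the constructor above: map a
# helper over the class subdirectories with a ThreadPool. The directory layout
# and the count_files() helper are illustrative stand-ins for
# _count_valid_files_in_directory.
import os
import multiprocessing.pool

def count_files(directory, extensions=('.png', '.jpg', '.jpeg', '.bmp', '.ppm')):
    return sum(1 for _, _, files in os.walk(directory)
               for f in files if f.lower().endswith(extensions))

def count_per_class(directory):
    classes = sorted(d for d in os.listdir(directory)
                     if os.path.isdir(os.path.join(directory, d)))
    pool = multiprocessing.pool.ThreadPool()
    try:
        counts = pool.map(count_files, (os.path.join(directory, c) for c in classes))
    finally:
        pool.close()
        pool.join()
    return dict(zip(classes, counts))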
def generate_prediction(self, model, verbose=False):
    """Implementation of sciunit.Test.generate_prediction."""

    efel.reset()

    self.observation = collections.OrderedDict(sorted(self.observation.items()))

    global model_name_soma
    model_name_soma = model.name

    pool = multiprocessing.Pool(self.npool, maxtasksperchild=1)

    stimuli_list = self.create_stimuli_list()

    run_stim_ = functools.partial(self.run_stim, model)
    traces_results = pool.map(run_stim_, stimuli_list, chunksize=1)
    pool.terminate()
    pool.join()
    del pool

    pool2 = multiprocessing.Pool(self.npool, maxtasksperchild=1)

    features_names, features_list = self.create_features_list(self.observation)

    analyse_traces_ = functools.partial(self.analyse_traces, stimuli_list, traces_results)
    feature_results = pool2.map(analyse_traces_, features_list, chunksize=1)
    pool2.terminate()
    pool2.join()
    del pool2

    feature_results_dict = {}
    for i in range(0, len(feature_results)):
        feature_results_dict.update(feature_results[i])  # concatenate dictionaries

    if self.specify_data_set != '':
        specify_data_set = '_' + self.specify_data_set
    else:
        specify_data_set = self.specify_data_set

    if self.base_directory:
        self.path_results = self.base_directory + 'results/' + 'somaticfeat' + specify_data_set + '/' + model.name + '/'
    else:
        self.path_results = model.base_directory + 'results/' + 'somaticfeat' + specify_data_set + '/'

    try:
        if not os.path.exists(self.path_results):
            os.makedirs(self.path_results)
    except OSError as e:
        if e.errno != 17:
            raise

    file_name = self.path_results + 'soma_features.p'

    SomaFeaturesDict = {}
    SomaFeaturesDict['traces_results'] = traces_results
    SomaFeaturesDict['features_names'] = features_names
    SomaFeaturesDict['feature_results_dict'] = feature_results_dict
    SomaFeaturesDict['observation'] = self.observation

    if self.save_all:
        pickle.dump(SomaFeaturesDict, gzip.GzipFile(file_name, "wb"))

    plt.close('all')  # needed to avoid overlapping of saved images when the test is run on multiple models in a for loop

    self.create_figs(model, traces_results, features_names, feature_results_dict, self.observation)

    soma_features = {}
    needed_keys = {'feature mean', 'feature sd'}
    for i in range(len(SomaFeaturesDict['features_names'])):
        feature_name = SomaFeaturesDict['features_names'][i]
        soma_features[feature_name] = {key: value for key, value in list(feature_results_dict[feature_name].items()) if key in needed_keys}

    file_name_json = self.path_results + 'somatic_model_features.json'
    json.dump(soma_features, open(file_name_json, "w"), indent=4)

    prediction = soma_features

    efel.reset()

    return prediction
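# Reduced sketch of the fan-out used above: bind the fixed argument with
# functools.partial and map the varying stimuli over a fresh Pool with
# maxtasksperchild=1 so every simulation gets a clean worker process.
# simulate() is a stand-in for the per-stimulus work function.
import functools
import multiprocessing

def simulate(model_name, stimulus):
    return {'model': model_name, 'stimulus': stimulus, 'response': stimulus * 2.0}

if __name__ == '__main__':
    stimuli = [0.1, 0.2, 0.4]
    run_one = functools.partial(simulate, 'cell_A')
    pool = multiprocessing.Pool(2, maxtasksperchild=1)
    try:
        results = pool.map(run_one, stimuli, chunksize=1)
    finally:
        pool.terminate()
        pool.join()
    print(results)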
def compute_descriptor_async(self, data_iter, descr_factory,
                             overwrite=False, procs=None, **kwds):
    """
    Asynchronously compute feature data for multiple data items.

    :param data_iter: Iterable of data elements to compute features for.
        These must have UIDs assigned for feature association in the return
        value.
    :type data_iter: collections.Iterable[smqtk.representation.DataElement]

    :param descr_factory: Factory instance to produce the wrapping descriptor
        element instances.
    :type descr_factory: smqtk.representation.DescriptorElementFactory

    :param overwrite: Whether or not to force re-computation of descriptor
        vectors for the given data even when precomputed vectors exist in the
        DescriptorElements generated by the provided factory. This will
        overwrite persistently stored vectors if the factory produces a
        DescriptorElement implementation with such storage.
    :type overwrite: bool

    :param procs: Optional specification of how many processors to use when
        pooling sub-tasks. If None, we attempt to use all available cores.
    :type procs: int | None

    :param pool_type: multiprocessing pool type to use. If not provided, a
        multiprocessing.pool.ThreadPool is used.
    :type pool_type: type | None

    :return: Mapping of input DataElement instances to the computed
        descriptor element. DescriptorElement UUIDs are congruent with the
        UUID of the data element they describe.
    :rtype: dict[smqtk.representation.DataElement,
                 smqtk.representation.DescriptorElement]

    """
    self._log.info("Async compute features")

    # Mapping of DataElement to async processing result
    #: :type: dict[smqtk.representation.DataElement, multiprocessing.pool.ApplyResult]
    ar_map = {}
    # Mapping of DataElement to the DescriptorElement for it.
    #: :type: dict[smqtk.representation.DataElement, smqtk.representation.DescriptorElement]
    de_map = {}

    # Queue up descriptor generation for descriptor elements that need
    # (re)computation.
    procs = procs and int(procs)
    pool_t = kwds.get("pool_type", multiprocessing.pool.ThreadPool)
    pool = pool_t(processes=procs)
    with SimpleTimer("Queuing descriptor computation...", self._log.debug):
        for d in data_iter:
            de_map[d] = descr_factory.new_descriptor(self.name, d.uuid())
            if overwrite or not de_map[d].has_vector():
                ar_map[d] = pool.apply_async(_async_feature_generator_helper,
                                             args=(self, d))
    pool.close()

    failures = False
    # noinspection PyPep8Naming
    perc_T = 0.0
    perc_inc = 0.1
    with SimpleTimer("Collecting async results...", self._log.debug):
        for i, (d, ar) in enumerate(ar_map.iteritems()):
            descriptor = ar.get()
            if descriptor is None:
                failures = True
                continue
            else:
                de_map[d].set_vector(descriptor)

            perc = float(i + 1) / len(ar_map)
            if perc >= perc_T:
                self._log.debug("Progress: [%d/%d] %3.3f%%",
                                i + 1, len(ar_map), perc * 100)
                perc_T += perc_inc
    pool.join()

    # Check for failed generation
    if failures:
        raise RuntimeError("Failure occurred during data feature "
                           "computation. See logging.")

    return de_map
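# Self-contained sketch of the queue-then-collect pattern used above: results
# from apply_async are keyed by their input, the pool input is closed, and the
# results are collected afterwards with failure tracking. work() is a stand-in
# for _async_feature_generator_helper.
import multiprocessing.pool

def work(x):
    return None if x < 0 else x * x   # None signals a per-item failure

def compute_all(values, procs=None):
    pool = multiprocessing.pool.ThreadPool(processes=procs)
    ar_map = dict((v, pool.apply_async(work, (v,))) for v in values)
    pool.close()
    results = {}
    failures = False
    for v, ar in ar_map.items():
        out = ar.get()
        if out is None:
            failures = True
            continue
        results[v] = out
    pool.join()
    if failures:
        raise RuntimeError("Failure occurred for some items. See logging.")
    return results

if __name__ == '__main__':
    print(compute_all([1, 2, 3]))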
def classify_async(self, d_iter, factory, overwrite=False, procs=None,
                   use_multiprocessing=False, ri=None):
    """
    Asynchronously classify the DescriptorElements in the given iterable.

    :param d_iter: Iterable of DescriptorElements
    :type d_iter: collections.Iterable[smqtk.representation.DescriptorElement]

    :param factory: Classifier element factory to use for element generation.
    :type factory: smqtk.representation.ClassificationElementFactory

    :param overwrite: Recompute classification of the input descriptor and
        set the results to the ClassificationElement produced by the factory.
    :type overwrite: bool

    :param procs: Explicit number of cores/threads/processes to use.
    :type procs: None | int

    :param use_multiprocessing: Use ``multiprocessing.pool.Pool`` instead of
        ``multiprocessing.pool.ThreadPool``.
    :type use_multiprocessing: bool

    :param ri: Progress reporting interval in seconds. Set to a value > 0 to
        enable. Disabled by default.
    :type ri: float | None

    :return: Mapping of input DescriptorElement instances to the computed
        ClassificationElement. ClassificationElement UUIDs are congruent with
        the UUID of the DescriptorElement.
    :rtype: dict[smqtk.representation.DescriptorElement,
                 smqtk.representation.ClassificationElement]

    """
    self._log.info("Async classifying descriptors")
    ri = ri and ri > 0 and ri

    # Mapping of DescriptorElement to async processing result
    ar_map = {}
    # Mapping of DescriptorElement to its associated ClassificationElement
    #: :type: dict[smqtk.representation.DescriptorElement, smqtk.representation.ClassificationElement]
    d2c_map = {}

    procs = procs and int(procs)
    if use_multiprocessing:
        pool = multiprocessing.pool.Pool(procs)
    else:
        pool = multiprocessing.pool.ThreadPool(procs)

    self._log.info("Queueing async work")
    i = j = 0
    s = lt = time.time()
    for d in d_iter:
        d2c_map[d] = factory.new_classification(self.name, d.uuid())
        i += 1
        if overwrite or not d2c_map[d].has_classifications():
            ar_map[d] = pool.apply_async(_async_helper_classify,
                                         args=(self, d))
            j += 1

        t = time.time()
        if ri and t - lt >= ri:
            self._log.debug("-- Scanned = %d :: Queued = %d "
                            "(per second = %f)",
                            i, j, i / (t - s))
            lt = t

    # Close pool input
    pool.close()

    self._log.info("Collecting results")
    failures = False
    s = lt = time.time()
    for i, (d, ar) in enumerate(ar_map.iteritems()):
        c = ar.get()
        if c is None:
            failures = True
            continue
        else:
            d2c_map[d].set_classification(c)

        # Progress reporting
        t = time.time()
        if ri and t - lt >= ri:
            self._log.debug("-- Complete = %d "
                            "(per second = %f)",
                            i, i / (t - s))
            lt = t
    pool.join()

    if failures:
        raise RuntimeError("Failure occurred during descriptor "
                           "classification. See logging.")

    return d2c_map
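# Minimal sketch of the two knobs that distinguish classify_async() above:
# choosing between a process pool and a thread pool, and time-interval based
# progress reporting. The reporting interval and the work function passed in
# are illustrative.
import multiprocessing.pool
import time

def _make_pool(procs=None, use_multiprocessing=False):
    if use_multiprocessing:
        return multiprocessing.pool.Pool(procs)
    return multiprocessing.pool.ThreadPool(procs)

def map_with_progress(func, items, ri=1.0, procs=None, use_multiprocessing=False):
    pool = _make_pool(procs, use_multiprocessing)
    async_results = [pool.apply_async(func, (item,)) for item in items]
    pool.close()
    out = []
    s = lt = time.time()
    for i, ar in enumerate(async_results):
        out.append(ar.get())
        t = time.time()
        if ri and t - lt >= ri:
            print("-- Complete = %d (per second = %f)" % (i + 1, (i + 1) / (t - s)))
            lt = t
    pool.join()
    return out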
def test_program(bits_to_flip):
    """
    Flip the specified bits in the initial code and then execute the code in
    an emulator.

    Note: we do some crazy stuff with multiprocessing because Unicorn
    sometimes SIGABRTs and we need to catch it.

    :param bits_to_flip: the locations of the bits to flip
    :return: the Result of emulating the glitched code (the flip operation,
        XOR, AND, or OR, is read from the module-level `flip_operation`)
    """
    global code_initial, flip_operation, result_cache, force_invalid_ins

    logger.debug(flip_operation)
    code_input = flip_bits(code_initial, bytes_to_trash, bits_to_flip,
                           flip_operation)

    # Is it cached?
    code_str = int(''.join(map(str, code_input)))
    if result_cache is not None and code_str in result_cache:
        return result_cache[code_str]

    logger.debug(code_input)

    # Convert the byte list to a str for the emulator
    system_code = ''.join(map(chr, code_input))

    if force_invalid_ins:
        asm = list(CAPSTONE.disasm(system_code, len(code_input)))
        if len(asm) == 0:
            logger.warning('>>> \tdisasm failure: %s',
                           system_code.encode('hex'))
        for ins in asm:
            if ins.bytes == "\x00" * len(ins.bytes) \
                    or ins.bytes == "\xff" * len(ins.bytes):
                logger.debug("Forcing invalid instruction")
                return Result.GLITCH_FAILED_INVALID_INSTRUCTION

    # Create a 1-process pool to execute our emulator in (effectively a
    # sandbox for SIGABRT)
    pool = MyPool(processes=1)

    # Run the emulator
    t = pool.apply_async(run_emulator, (system_code,))

    # Get the result
    try:
        rtn = t.get(timeout=1)
    except Exception:
        # Sometimes Unicorn will SIGABRT; we need to catch that
        logger.exception("Got a really bad fail!!!")
        rtn = Result.GLITCH_FAILED_SIGABRT
    del t

    # Make sure we don't keep making pools
    pool.close()
    pool.terminate()
    pool.join()
    del pool

    if result_cache is not None:
        result_cache[code_str] = rtn

    return rtn
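# Self-contained sketch of the "single-process pool as a crash sandbox" idea
# used above: run an unreliable callable in a throwaway worker and treat a
# timeout or a dead worker as a failure result. unreliable() stands in for
# run_emulator(), and the failure string stands in for a Result value.
import multiprocessing
import os

def unreliable(arg):
    if arg == "crash":
        os._exit(1)          # simulate the worker dying abruptly (e.g. SIGABRT)
    return "ok: " + arg

def run_sandboxed(arg, timeout=1):
    pool = multiprocessing.Pool(processes=1)
    try:
        return pool.apply_async(unreliable, (arg,)).get(timeout=timeout)
    except Exception:
        return "GLITCH_FAILED"
    finally:
        pool.terminate()
        pool.join()

if __name__ == '__main__':
    print(run_sandboxed("hello"))
    print(run_sandboxed("crash"))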
def baseline_batch():
    flags = get_arguments()

    # Read script arguments from run.config file.
    num_processors, num_runs, seeds = config_parser.parse_run_params(print_params=False)

    if len(seeds) != num_runs:
        raise configparser.Error('Number of seeds in run.config `seeds`'
                                 ' must match the number of runs (`num_runs`) argument.')

    print('Arguments (baseline.py):')
    print('-----------------------')
    print('Number of runs: {0}'.format(num_runs))
    print('Number of processors: {0}'.format(num_processors))
    print('Train seeds: {0}\n'.format(seeds))

    # Assess total number of processors.
    processors_total = mp.cpu_count()
    print(f'Total number of processors available: {processors_total}\n')

    # Adjust number of processors.
    if num_processors > processors_total:
        num_processors = processors_total
        print(f'Number of processors downgraded to {num_processors}\n')

    # Read train.py arguments from train.config file.
    baseline_config = configparser.ConfigParser()
    baseline_path = CONFIG_PATH / 'train.config'
    baseline_config.read(str(baseline_path))

    # Set up sumo-tls-type.
    baseline_config.set('train_args', 'tls_type', flags.tls_type)
    baseline_config.set('train_args', 'experiment_save_agent', str(False))

    # Override train configurations with test parameters.
    test_config = configparser.ConfigParser()
    test_path = CONFIG_PATH / 'test.config'
    test_config.read(test_path.as_posix())

    horizon = int(test_config.get('test_args', 'rollout-time'))
    baseline_config.set('train_args', 'experiment_time', str(horizon))

    # Write .xml files for test plots creation.
    baseline_config.set('train_args', 'sumo_emission', str(True))

    timestamp = datetime.now().strftime('%Y%m%d%H%M%S.%f')
    print(f'Experiment timestamp: {timestamp}')

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Create a config file for each train.py run with the respective
        # seed. These config files are stored in a temporary directory.
        tmp_path = Path(tmp_dir)
        baseline_configs = []
        for seed in seeds:
            cfg_path = tmp_path / f'{flags.tls_type}-{seed}.config'
            baseline_configs.append(cfg_path)

            # Set up train seed.
            baseline_config.set("train_args", "experiment_seed", str(seed + 1))

            # Write temporary train config file.
            with cfg_path.open('w') as ft:
                baseline_config.write(ft)

        # rvs: directories' names holding experiment data
        if num_processors > 1:
            packed_args = list(zip(range(len(baseline_configs)), baseline_configs))
            pool = NonDaemonicPool(num_processors)
            rvs = pool.map(delay_baseline, packed_args)
            pool.close()
            pool.join()
        else:
            rvs = []
            for cfg in baseline_configs:
                rvs.append(delay_baseline((0.0, cfg)))

        # Create a directory and move newly created files.
        paths = [Path(f) for f in rvs]
        commons = [p.parent for p in paths]
        if len(set(commons)) > 1:
            raise ValueError(f'Directories {set(commons)} must have the same root')
        dirpath = commons[0]
        batchpath = dirpath / timestamp
        if not batchpath.exists():
            batchpath.mkdir()

        # Move files.
        for src in paths:
            dst = batchpath / src.parts[-1]
            src.replace(dst)

    sys.stdout.write(str(batchpath))

    return str(batchpath)
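# Reduced sketch of the fan-out-or-serial branch above: pack (delay, config)
# tuples and map them over a pool when more than one processor is requested,
# otherwise run serially. run_one() stands in for delay_baseline, and a plain
# multiprocessing.Pool stands in for the project-specific NonDaemonicPool.
import multiprocessing as mp

def run_one(packed):
    delay, cfg_path = packed
    return "results/run-for-%s" % cfg_path

def run_batch(config_paths, num_processors):
    if num_processors > 1:
        packed_args = list(zip(range(len(config_paths)), config_paths))
        pool = mp.Pool(num_processors)
        rvs = pool.map(run_one, packed_args)
        pool.close()
        pool.join()
    else:
        rvs = [run_one((0.0, cfg)) for cfg in config_paths]
    return rvs

if __name__ == '__main__':
    print(run_batch(['a.config', 'b.config'], num_processors=2))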