Example #1
def _map_parallel(function, args, n_jobs):
    """multiprocessing.Pool(processors=n_jobs).map with some error checking"""
    # Following the error checking found in joblib
    multiprocessing = int(os.environ.get('JOBLIB_MULTIPROCESSING', 1)) or None
    if multiprocessing:
        try:
            import multiprocessing
            import multiprocessing.pool
        except ImportError:
            multiprocessing = None
    # 2nd stage: validate that locking is available on the system and
    #            issue a warning if not
    if multiprocessing:
        try:
            _sem = multiprocessing.Semaphore()
            del _sem  # cleanup
        except (ImportError, OSError) as e:
            multiprocessing = None
            warnings.warn('%s. _map_parallel will operate in serial mode' % (e,))
    if multiprocessing and int(n_jobs) not in (0, 1):
        if n_jobs == -1:
            n_jobs = None
        pool = multiprocessing.Pool(processes=n_jobs)
        map_result = pool.map(function, args)
        pool.close()
        pool.join()
    else:
        map_result = list(map(function, args))
    return map_result
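For reference, a minimal usage sketch (it assumes `os` and `warnings` are imported at module level, since the function uses both; `square` is a hypothetical worker that must live at module scope so a process pool can pickle it):

import os
import warnings

def square(x):
    # Workers handed to a process pool must be picklable, i.e. defined at
    # module level; lambdas and nested functions would fail here.
    return x * x

if __name__ == "__main__":
    serial = _map_parallel(square, range(8), n_jobs=1)     # bypasses the pool entirely
    parallel = _map_parallel(square, range(8), n_jobs=-1)  # -1 becomes Pool(processes=None), i.e. all cores
    assert serial == parallel == [x * x for x in range(8)]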
Example #2
def run_trajectory(t, ps, landscape, ptv, num_iterations, num_processors):
    # Get the points in the trajectory
    points = t.points()

    # Determine the index of each unique point (sometimes points are equal due to rounding)
    uinds = [i for i, p in enumerate(points) if i == 0 or not p.equals(points[i - 1])]

    # Create a process pool, using as many processors as are available, or
    # are required to allow each point to run concurrently
    pool = mp.Pool(processes=min(num_processors, len(points)))

    results = []
    for i in uinds:
        # Modify the parameter set to match the current point
        psm = ps.copy()
        psm.modify_for_point(points[i], ptv)
        psm.convert_to_age_classes()

        # Launch a process to run the simulation(s) for the point. This modifies the point in place
        args = [points[i], psm, landscape, num_iterations, num_processors]
        results.append(pool.apply_async(run_iterations_for_point, args))

    pool.close()
    pool.join()

    # Merge the unique and non-unique points back together
    for i, r in zip(uinds, results):
        points[i] = r.get(None)

    # Return a new trajectory containing the results for each point
    return io.Trajectory(points=points)
Example #3
@contextlib.contextmanager
def ScopedPool(*args, **kwargs):
  """Context Manager which returns a multiprocessing.pool instance which
  correctly deals with thrown exceptions.

  *args - Arguments to multiprocessing.pool

  Kwargs:
    kind ('threads', 'procs') - The type of underlying coprocess to use.
    **etc - Arguments to multiprocessing.pool
  """
  if kwargs.pop('kind', None) == 'threads':
    pool = multiprocessing.pool.ThreadPool(*args, **kwargs)
  else:
    orig, orig_args = kwargs.get('initializer'), kwargs.get('initargs', ())
    kwargs['initializer'] = _ScopedPool_initer
    kwargs['initargs'] = orig, orig_args
    pool = multiprocessing.pool.Pool(*args, **kwargs)

  try:
    yield pool
    pool.close()
  except:
    pool.terminate()
    raise
  finally:
    pool.join()
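A hypothetical usage sketch (assumes `contextlib` and `multiprocessing.pool` are imported in the surrounding module and that `_ScopedPool_initer` is the pool initializer defined there):

# Thread-backed pool: closures and lambdas are fine because nothing is pickled.
with ScopedPool(4, kind='threads') as pool:
    squares = pool.map(lambda x: x * x, range(10))
# An unhandled exception inside the block triggers pool.terminate() instead of
# a clean close(), and pool.join() runs in either case.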
Example #4
def test_no_thread_pool():
    pool = xmon_stepper.ThreadlessPool()
    result = pool.map(lambda x: x + 1, range(10))
    assert result == [x + 1 for x in range(10)]
    # No ops.
    pool.terminate()
    pool.join()
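`xmon_stepper.ThreadlessPool` itself is not shown here; conceptually it is a serial drop-in that exposes just enough of the Pool API for code like the test above. A minimal sketch of that idea (not Cirq's actual implementation):

class SerialPool:
    """Serial stand-in exposing the subset of the Pool API used above."""

    def map(self, func, iterable):
        # Run everything in the calling thread, preserving order.
        return [func(item) for item in iterable]

    def terminate(self):
        pass  # nothing to stop

    def join(self):
        pass  # nothing to wait for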
Example #5
def refine(L, X, D, e, a, b, k, num_workers, metric):
    """ Throw out bad points (algorithm 7, lines 7-17)
    :param L: List of subsets
    :param X: Data matrix
    :param D: dictionary
    :param e: lower bound on fractional size of each cluster
    :param a: lower bound on fractional size of a set inside own cluster for which stability holds
    :param b: lower bound on fractional size of a set outside own cluster for which stability holds
    :param k: Number of clusters
    :param num_workers: Number of workers
    :param metric: metric is in {avg, max, min}
    :return: Refined clusters
    """
    print("Getting rid of bad points")
    print("Length of L at start = ", len(L))
    start = time.time()
    n = len(X)
    T = int((e - 2 * a - b * k) * n)
    t = int((e - a) * n)
    with Pool() as pool:
        func = partial(refine_individual, D, T, t)
        L = pool.map(func, L)
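        # pool.map() blocks until all tasks finish, so close()/join() here are
        # belt-and-braces; exiting the 'with' block calls pool.terminate() either way.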
        pool.close()
        pool.join()
    end = time.time()
    print("Length of L on end = ", len(L))
    print("time = {0:.2f}s".format(end - start))
    return grow(L, X, a, num_workers, metric)
Example #6
def from_carrays(path, format_categories='bcolz', format_codes='bcolz', format_values='bcolz', parallel=True):
    assert os.path.exists(path), 'No path {}'.format(path)
    df_columns = glob.glob(os.path.join(path, '*'))
    df = dict()
    if parallel:
        pool = multiprocessing.pool.ThreadPool()
        results = []
        for i, k in enumerate(df_columns):
            p = pool.apply_async(_from_carray, args=(k,), kwds={'format_categories': format_categories, 'format_codes': format_codes, 'format_values': format_values})
            results.append(p)
        pool.close()
        pool.join()
        for x in results:
            meta, s = x.get()
            df[meta['name']] = s
    else:
        for i, k in enumerate(df_columns):
            meta, s = _from_carray(k, format_categories=format_categories, format_codes=format_codes, format_values=format_values)
            df[meta['name']] = s

    # # # this is slow when we have non categoricals as series for some reason
    with log.timedlogger('constructing dataframe from %s column dict' % len(df)):
        df = pandas.DataFrame(df)  # TODO: fast DataFrame constructor

    return df
Example #7
def parallel_compile(self, sources, output_dir=None, macros=None,
                     include_dirs=None, debug=0, extra_preargs=None,
                     extra_postargs=None, depends=None):
    """New compile function that we monkey patch into the existing compiler instance.
    """
    import multiprocessing.pool

    # Copied from the regular compile function
    macros, objects, extra_postargs, pp_opts, build = \
            self._setup_compile(output_dir, macros, include_dirs, sources,
                                depends, extra_postargs)
    cc_args = self._get_cc_args(pp_opts, debug, extra_preargs)

    def _single_compile(obj):
        try:
            src, ext = build[obj]
        except KeyError:
            return
        self._compile(obj, src, ext, cc_args, extra_postargs, pp_opts)

    # Set by fix_compiler
    global glob_use_njobs
    if glob_use_njobs == 1:
        # This is equivalent to regular compile function
        for obj in objects:
            _single_compile(obj)
    else:
        # Use ThreadPool, rather than Pool, since the objects are not picklable.
        pool = multiprocessing.pool.ThreadPool(glob_use_njobs)
        pool.map(_single_compile, objects)
        pool.close()
        pool.join()

    # Return *all* object filenames, not just the ones we just built.
    return objects
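A ThreadPool is sufficient for a real speed-up here even under the GIL, because each _compile call spends most of its time waiting on an external compiler subprocess, during which the calling thread releases the GIL.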
Example #8
def pass_data_to_search(symbol,path,start_time_seconds,end_time_seconds,date,time_interval,tt,code_path):

    jobs=[]
    dic_files={}
    lis=[]
    slot_results=[]
    
    file_name = path+'b'+date+'.l.bz2'
    # file_name = path + date+'/'+dic_files[lis[index]]+'.bz2'
        
    size = os.path.getsize(file_name)
    total_rows = size // 69  # integer division keeps the row count whole under Python 3
    total_processes1 = 40
    slots = total_rows // total_processes1

    #Multiprocessing each file as chunk
    # mapper(0,slots,total_processes1,symbol,start_time_seconds,end_time_seconds,date,time_interval,file_name,tt,code_path)
    # mapper(1,slots,total_processes1,symbol,start_time_seconds,end_time_seconds,date,time_interval,file_name,tt,code_path)
    
    pool = multiprocessing.Pool(total_processes1)
    

    for i in range(total_processes1):

        pool.apply_async(mapper, args = (i,slots,total_processes1,symbol,start_time_seconds,end_time_seconds,date,time_interval,file_name,tt,code_path))
        
    pool.close()
    pool.join()    
Example #9
def test():
    print("Creating 5 (non-daemon) workers and jobs in main process.")
    pool = MyPool(5)
    result = pool.map(work, [randint(1, 5) for x in range(5)])
    pool.close()
    pool.join()
    print(result)
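`MyPool` is not defined in this snippet; examples like this one (and #22/#25/#26 below) typically rely on the well-known non-daemonic pool recipe so that workers of the outer pool may spawn child processes of their own. A sketch of that recipe for Python 3.4+, under the assumption that this is what `MyPool` stands for:

import multiprocessing
import multiprocessing.pool

class NoDaemonProcess(multiprocessing.Process):
    # Always report daemon=False and ignore attempts to change it, so that
    # workers created by this pool are allowed to have children.
    @property
    def daemon(self):
        return False

    @daemon.setter
    def daemon(self, value):
        pass

class NoDaemonContext(type(multiprocessing.get_context())):
    Process = NoDaemonProcess

class MyPool(multiprocessing.pool.Pool):
    # Subclass multiprocessing.pool.Pool (multiprocessing.Pool is only a factory
    # function) and hand it a context whose Process class is non-daemonic.
    def __init__(self, *args, **kwargs):
        kwargs['context'] = NoDaemonContext()
        super().__init__(*args, **kwargs)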
Example #10
def _map_parallel(function, args, n_jobs):
    """multiprocessing.Pool(processors=n_jobs).map with some error checking"""
    # Following the error checking found in joblib
    multiprocessing = int(os.environ.get('JOBLIB_MULTIPROCESSING', 1)) or None
    if multiprocessing:
        try:
            import multiprocessing
            import multiprocessing.pool
        except ImportError:
            multiprocessing = None
        if sys.platform.startswith("win") and PY2:
            msg = "Multiprocessing is not supported on Windows with Python 2.X. Setting n_jobs=1"
            logger.warning(msg)
            n_jobs = 1
    # 2nd stage: validate that locking is available on the system and
    #            issue a warning if not
    if multiprocessing:
        try:
            _sem = multiprocessing.Semaphore()
            del _sem  # cleanup
        except (ImportError, OSError) as e:
            multiprocessing = None
            logger.warning('{}. _map_parallel will operate in serial mode'.format(e))
    if multiprocessing and int(n_jobs) not in (0, 1):
        if n_jobs == -1:
            n_jobs = None
        pool = multiprocessing.Pool(processes=n_jobs)
        try:
            map_result = pool.map(function, args)
        finally:
            pool.close()
            pool.join()
    else:
        map_result = list(map(function, args))
    return map_result
Example #11
def main():
    if len(sys.argv) < 3:
        print("Syntax:")
        print(
            "  {} [min_yeses] [out_csv_file]".format(
                sys.argv[0]
            )
        )
        sys.exit(1)

    min_yeses = eval(sys.argv[1])
    out_csv_file = sys.argv[2]

    pconfig = config.PaperworkConfig()
    pconfig.read()

    src_dir = pconfig.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir)
    src_dsearch.reload_index()

    nb_threads = multiprocessing.cpu_count()
    pool = multiprocessing.pool.ThreadPool(processes=nb_threads)

    with open(out_csv_file, 'a', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        for min_yes in min_yeses:
            pool.apply_async(
                _run_simulation,
                (src_dsearch, min_yes, csvwriter,)
            )
        pool.close()
        pool.join()
    print("All done !")
Example #12
    def _listArtifacts(self, urls, gavs):
        """
        Loads maven artifacts from list of GAVs and tries to locate the artifacts in one of the
        specified repositories.

        :param urls: repository URLs where the given GAVs can be located
        :param gavs: List of GAVs
        :returns: Dictionary where index is MavenArtifact object and value is it's repo root URL.
        """
        def findArtifact(gav, urls, artifacts):
            artifact = MavenArtifact.createFromGAV(gav)
            for url in urls:
                if maven_repo_util.gavExists(url, artifact):
                    #Critical section?
                    artifacts[artifact] = ArtifactSpec(url, [ArtifactType(artifact.artifactType, True, set(['']))])
                    return

            logging.warning('Artifact %s not found in any url!', artifact)

        artifacts = {}
        pool = ThreadPool(maven_repo_util.MAX_THREADS)
        for gav in gavs:
            pool.apply_async(findArtifact, [gav, urls, artifacts])

        # Close the pool and wait for the workers to finish
        pool.close()
        pool.join()

        return artifacts
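Regarding the "#Critical section?" question above: each task writes a distinct key, and in CPython a single dict item assignment is protected by the GIL, so the ThreadPool workers do not corrupt `artifacts`; an explicit lock would still be the clearer choice if the update ever grew beyond one assignment.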
Example #13
def slippy_test(test_options, width=TILE_WIDTH, height=TILE_HEIGHT, tile_factor=TILE_FACTOR):
	#assume each screen is a 10x5 grid of tiles
	#this is approximately the OTM map size at full screen
	#at my desk
	z = test_options['z']
	x = test_options['x']
	y = test_options['y']
	url_prefix = test_options['url_prefix']


	tiles_to_request = []
	for x_iter in range(x - width // 2, x + width // 2 - 1):
		for y_iter in range(y - height // 2, y + height // 2 - 1):
			tiles_to_request.append(url_prefix + '%d/%d/%d.png' % (z, x_iter, y_iter))

	pool = multiprocessing.Pool(processes=tile_factor)
	start_time = time.time()
	results = pool.map(slippy_test_helper, tiles_to_request)
	end_time = time.time()
	pool.close()
	pool.join()
	sys.stderr.write('.')

	if(False in results):
		return '%d,ERROR,%f' % (-1, float('nan'))
	return '%d,OK,' % z + str(end_time - start_time)
Example #14
def threshold(X, e, a, b, k, num_workers, metric):
    """ Get all threshold clusters (algorithm 7, lines 1-6)
    :param X: Data matrix
    :param e: lower bound on fractional size of each cluster
    :param a: lower bound on fractional size of a set inside own cluster for which stability holds
    :param b: lower bound on fractional size of a set outside own cluster for which stability holds
    :param k: Number of clusters
    :param num_workers: Number of workers
    :param metric: metric is in the set {avg, min, max}
    :return: Threshold clusters
    """
    print("Populating list with all threshold clusters with metric:", metric)
    start = time.time()
    n = len(X)
    minsize = int(e * n)
    with Pool(num_workers) as pool:
        func = partial(get_thresholds, X, minsize, num_workers, metric)
        items = pool.map(func, range(n))
        pool.close()
        pool.join()
    threshold_lists = [item[0] for item in items]
    L = [item for sublist in threshold_lists for item in sublist]
    D = dict([(item[1], item[2]) for item in items])
    end = time.time()
    print("Length of L = ", len(L))
    print("time = {0:.2f}s".format(end - start))
    return refine(L, X, D, e, a, b, k, num_workers, metric)
Example #15
    def count_intersect(self, threshold, frequency=True):

        self.counts = OrderedDict()
        self.rlen, self.qlen = {}, {}
        self.nalist = []

        if frequency:
            self.frequency = OrderedDict()

        # if self.mode_count == "bp":
        #    print2(self.parameter, "\n{0}\t{1}\t{2}\t{3}\t{4}".format("Reference","Length(bp)", "Query", "Length(bp)", "Length of Intersection(bp)"))
        # elif self.mode_count == "count":
        #    print2(self.parameter, "\n{0}\t{1}\t{2}\t{3}\t{4}".format("Reference","sequence_number", "Query", "sequence_number", "Number of Intersection"))

        for ty in self.groupedreference.keys():
            self.counts[ty] = OrderedDict()
            self.rlen[ty], self.qlen[ty] = OrderedDict(), OrderedDict()
            if frequency:
                self.frequency[ty] = OrderedDict()

            for r in self.groupedreference[ty]:
                if r.total_coverage() == 0 and len(r) > 0:
                    self.nalist.append(r.name)
                    continue
                else:
                    self.counts[ty][r.name] = OrderedDict()
                    if self.mode_count == "bp":
                        rlen = r.total_coverage()
                    elif self.mode_count == "count":
                        rlen = len(r)
                    self.rlen[ty][r.name] = rlen

                    mp_input = []
                    for q in self.groupedquery[ty]:
                        if r.name == q.name:
                            continue
                        else:
                            mp_input.append([q, self.nalist, self.mode_count, self.qlen, threshold,
                                             self.counts, frequency, self.frequency, ty, r])
                    # q, nalist, mode_count, qlen_dict, threshold, counts, frequency, self_frequency, ty, r
                    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count() - 1)
                    mp_output = pool.map(mp_count_intersect, mp_input)
                    pool.close()
                    pool.join()

                    # qname, nalist, qlen_dict[ty][q.name], counts[ty][r.name][q.name], self_frequency[ty][q.name].append(c[2])
                    for output in mp_output:
                        if output[1]:
                            self.nalist.append(output[1])
                        else:
                            self.qlen[ty][output[0]] = output[2]
                            self.counts[ty][r.name][output[0]] = output[3]
                            # print(r.name)
                            # print(output[0])
                            # print(output[3])
                            try:
                                self.frequency[ty][output[0]][r.name] = output[3][2]
                            except KeyError:
                                self.frequency[ty][output[0]] = {}
                                self.frequency[ty][output[0]][r.name] = output[3][2]
Example #16
def _CompileDeps(aapt_path, dep_subdirs, temp_dir):
  partials_dir = os.path.join(temp_dir, 'partials')
  build_utils.MakeDirectory(partials_dir)
  partial_compile_command = [
      aapt_path + '2',
      'compile',
      # TODO(wnwen): Turn this on once aapt2 forces 9-patch to be crunched.
      # '--no-crunch',
  ]
  pool = multiprocessing.pool.ThreadPool(10)
  def compile_partial(directory):
    dirname = os.path.basename(directory)
    partial_path = os.path.join(partials_dir, dirname + '.zip')
    compile_command = (partial_compile_command +
                       ['--dir', directory, '-o', partial_path])
    build_utils.CheckOutput(compile_command)

    # Sorting the files in the partial ensures deterministic output from the
    # aapt2 link step which uses order of files in the partial.
    sorted_partial_path = os.path.join(partials_dir, dirname + '.sorted.zip')
    _SortZip(partial_path, sorted_partial_path)

    return sorted_partial_path

  partials = pool.map(compile_partial, dep_subdirs)
  pool.close()
  pool.join()
  return partials
Example #17
def process_iteration(Ns, ps, landscape, config):
	output_dir = config.output_dir + config.ext
	
	if config.background_image != None:
		background_path = config.input_dir + "/" + config.background_image
	else:
		background_path = None
	
	#Create a point to hold the iteration
	p = Point()
	p.add_iteration()
	
	#draw_population(Ns[0], landscape, ps.totalK, 0, output_dir, 2.0, background_path)
	
	if config.display:
		pool = mp.Pool(config.num_processors)

	for t in xrange(min(ps.max_time_steps, len(Ns))):
		if config.display:
			pool.apply_async(draw_population, [Ns[t], landscape, ps.totalK, t, output_dir, 2.0, background_path])
		
		p.add_time_step([t] + population_statistics(ps, landscape, Ns[t]))
	
	if config.display:
		pool.close()

	#Write the iteration results to file as a trajectory containing a single point
	write_trajectories([Trajectory(points=[p])], None, ps.sentinels, output_dir + "/results.txt")

	if config.save_time_steps:
		np.savez(output_dir + "/populations.npz", *Ns)

	if config.display:
		pool.join()
Example #18
    def run_abstraction_parallel(self):
        # initialization
        self.__get_methods()
        self.__read_config()
        self.__get_dataset()

        # get filename and properties
        filename_properties = []
        for filename, properties in self.files.iteritems():
            filename_properties.append((filename, properties))

        # run experiment in multiprocessing mode
        total_cpu = multiprocessing.cpu_count()
        pool = NoDaemonProcessPool(processes=total_cpu)
        results = pool.map(self, filename_properties)
        pool.close()
        pool.join()

        # open evaluation file
        self.__check_path(self.files['evaluation_directory'])
        f = open(self.files['evaluation_file'], 'wt')
        writer = csv.writer(f)

        # set header for evaluation file
        header = []
        if self.configuration['main']['abstraction'] == '1':
            header = self.configuration['abstraction_evaluation']['evaluation_file_header'].split('\n')
        writer.writerow(tuple(header))

        # write experiment result
        for result in results:
            writer.writerow(result)

        # close evaluation file
        f.close()
Example #19
def download_junit(db, threads, client_class):
    """Download junit results for builds without them."""
    builds_to_grab = db.get_builds_missing_junit()
    pool = None
    if threads > 1:
        pool = multiprocessing.pool.ThreadPool(
            threads, mp_init_worker, ('', {}, client_class, False))
        test_iterator = pool.imap_unordered(
            get_junits, builds_to_grab)
    else:
        global WORKER_CLIENT  # pylint: disable=global-statement
        WORKER_CLIENT = client_class('', {})
        test_iterator = (
            get_junits(build_path) for build_path in builds_to_grab)
    for n, (build_id, build_path, junits) in enumerate(test_iterator, 1):
        print('%d/%d' % (n, len(builds_to_grab)),
              build_path, len(junits), len(''.join(junits.values())))
        junits = {k: remove_system_out(v) for k, v in junits.iteritems()}

        db.insert_build_junits(build_id, junits)
        if n % 100 == 0:
            db.commit()
    db.commit()
    if pool:
        pool.close()
        pool.join()
Example #20
def wrapper(*args, **kwargs):
    try:
        return f(*args, **kwargs)
    except Exception as e:
        import traceback
        print(traceback.format_exc())
        pool.join()
        sys.exit()
Example #21
def create_process_pool(index):
    print index
    li = range(3)
    pool = multiprocessing.Pool(processes = len(li))
    for sub_index in li:
        pool.apply_async(print_process_index, (index, sub_index))
    pool.close()
    pool.join()
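Note that if create_process_pool is itself executed inside a worker of an ordinary multiprocessing.Pool, the inner Pool(...) call fails with "daemonic processes are not allowed to have children"; lifting that restriction is exactly what the non-daemonic pool subclasses elsewhere on this page (MyPool, NoDaemonPool, NoDaemonProcessPool, CustomPool) are for.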
Example #22
    def work(num_procs):
        print("Creating %i (daemon) workers and jobs in child." % num_procs)
        pool = multiprocessing.Pool(num_procs)

        result = pool.map(sleepwhile,
                          [randint(1, 5) for x in range(num_procs)])
        pool.close()
        pool.join()
        return result
Example #23
def laminar(L, X, e, a, b, num_workers, metric):
    """ Make family laminar (Algorithm 9)
    :param L: List of subsets
    :param X: The data set
    :param e: lower bound on the fractional size of every cluster
    :param a: lower bound on the fractional size of every set in own cluster for which stability holds
    :param b: lower bound on the fractional size of every set in outside cluster for which stability holds
    :param num_workers: number of workers
    :param metric: metric is in {avg, max, min}
    :return: Laminar list
    """
    print("Making the list laminar (parallel)")
    start = time.time()
    n = len(X)
    print("Computing pairs of non-laminar sets")
    with Pool(num_workers) as pool:
        func = partial(non_laminar, L)
        intersections = pool.map(func, range(len(L) - 1))
        pool.close()
        pool.join()
    intersections = [item for sub_list in intersections for item in sub_list]
    end = time.time()
    fname = "intersections_" + metric + ".pkl.gz"
    # with gzip.open(fname, 'wb') as f:
    #    pickle.dump(intersections, f)
    print("Length of intersections = ", len(intersections))
    print("time = {0:0.2f}s".format(end - start))
    print("Removing non-laminar pairs")
    start = time.time()
    manager = Manager()
    shared_L = manager.list(L)
    n = len(intersections)
    j = 0
    batch = int(n / num_workers)
    rem = n % num_workers
    jobs = []
    for i in range(num_workers):
        process = Process(
            target=iterate_laminar, args=(shared_L, X, e, a, b, num_workers, metric, intersections[j : j + batch])
        )
        process.start()
        jobs.append(process)
        j += batch
    if rem:
        process = Process(
            target=iterate_laminar, args=(shared_L, X, e, a, b, num_workers, metric, intersections[j : j + rem])
        )
        process.start()
        jobs.append(process)
    for p in jobs:
        p.join()
    L = [item for item in shared_L if item is not None]
    end = time.time()
    print("Length of list after removing non-laminar pairs = ", len(L))
    print("time = {0:.2f}s".format(end - start))
    return L
Example #24
def update_all(opts):
    """Updates all menus"""
    pool = NoDaemonPool(processes=5)
    pool.apply_async(update_applications, (opts,))
    pool.apply_async(update_bookmarks, (opts,))
    pool.apply_async(update_recent_files, (opts,))
    pool.apply_async(update_devices, (opts,))
    pool.apply_async(update_rootmenu, (opts,))
    pool.close()
    pool.join()
Example #25
def test():
    print("Creating 5 (non-daemon) workers and jobs in main process.")

    year = [x for x in range(2008, 2014)]

    pool = CustomPool(len(year)*4)

    result = pool.map(work,year)

    pool.close()
    pool.join()
Example #26
def work(num_procs):
    print "Creating %i (daemon) workers and jobs in child." % num_procs
    pool = multiprocessing.Pool(num_procs)

    result = pool.map(sleepawhile, [randint(1, 5) for x in range(num_procs)])

    # The following is not really needed, since the (daemon) workers of the
    # child's pool are killed when the child is terminated, but it's good
    # practice to cleanup after ourselves anyway.
    pool.close()
    pool.join()
    return result
Example #27
def _ConvertToWebP(webp_binary, png_files):
  pool = multiprocessing.pool.ThreadPool(10)
  def convert_image(png_path):
    root = os.path.splitext(png_path)[0]
    webp_path = root + '.webp'
    args = [webp_binary, png_path] + _PNG_TO_WEBP_ARGS + [webp_path]
    subprocess.check_call(args)
    os.remove(png_path)
  # Android requires pngs for 9-patch images.
  pool.map(convert_image, [f for f in png_files if not f.endswith('.9.png')])
  pool.close()
  pool.join()
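The nested convert_image function works here only because ThreadPool dispatches tasks to threads without pickling them; a process-based multiprocessing.Pool could not pickle a closure defined inside the enclosing function, and pool.map would fail.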
Example #28
def parse_sam_in_threads(remap_csv, nthreads):
    """ Call parse_sam() in multiple processes.

    Launch a multiprocessing pool, walk through the iterator, and then be sure
    to close the pool at the end.
    """
    pool = Pool(processes=nthreads)
    try:
        reads = pool.imap(parse_sam, iterable=matchmaker(remap_csv), chunksize=100)
        for read in reads:
            yield read
    finally:
        pool.close()
        pool.join()
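The same pool-as-lazy-generator pattern in isolation, with purely illustrative names (`_double`, `doubled`) standing in for parse_sam and matchmaker:

from multiprocessing import Pool

def _double(x):
    return 2 * x

def doubled(values, nprocs=4):
    """Yield results lazily; close and join the pool even if the consumer stops early."""
    pool = Pool(processes=nprocs)
    try:
        # imap streams results back as chunks complete, keeping memory bounded.
        for result in pool.imap(_double, values, chunksize=100):
            yield result
    finally:
        pool.close()
        pool.join()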
Example #29
def _ConvertToWebP(webp_binary, png_files):
  pool = multiprocessing.pool.ThreadPool(10)
  def convert_image(png_path):
    root = os.path.splitext(png_path)[0]
    webp_path = root + '.webp'
    args = [webp_binary, png_path, '-mt', '-quiet', '-m', '6', '-q', '100',
        '-lossless', '-o', webp_path]
    subprocess.check_call(args)
    os.remove(png_path)

  pool.map(convert_image, [f for f in png_files
                           if not _PNG_WEBP_BLACKLIST_PATTERN.match(f)])
  pool.close()
  pool.join()
Example #30
    def buildList(self):
        """
        Build the artifact "list" from sources defined in the given configuration.

        :returns: Dictionary described above.
        """
        priority = 0
        pool_dict = {}

        for source in self.configuration.artifactSources:
            priority += 1
            pool = pool_dict.setdefault(source['type'], ThreadPool(self.MAX_THREADS_DICT[source['type']]))
            pool.apply_async(self._read_artifact_source, args=[source, priority],
                             callback=self._add_result)

        for pool in pool_dict.values():
            pool.close()

        at_least_1_runs = True
        all_keys = range(1, len(self.configuration.artifactSources) + 1)
        finished = False
        while at_least_1_runs:
            for i in range(30):
                time.sleep(1)

                if not self.errors.empty():
                    for pool in pool_dict.values():
                        logging.debug("Terminating pool %s", str(pool))
                        pool.terminate()
                    finished = True
                    break

            at_least_1_runs = False
            if not finished:            
                self.results_lock.acquire()
                finished = sorted(list(self.results.keys()))
                self.results_lock.release()
                if all_keys != finished:
                    logging.debug("Still waiting for priorities %s to finish", str(list(set(all_keys) - set(finished))))
                    at_least_1_runs = True

        for pool in pool_dict.values():
            if pool._state != multiprocessing.pool.TERMINATE:
                pool.join()

        if not self.errors.empty():
            raise RuntimeError("%i error(s) occurred during reading of artifact list." % self.errors.qsize())

        return self._get_artifact_list()
Example #31
    def __init__(self,
                 directory,
                 window_size,
                 window_stride,
                 window_type,
                 normalize,
                 max_len=101,
                 target_size=(256, 256),
                 color_mode='grayscale',
                 classes=None,
                 class_mode='categorical',
                 batch_size=32,
                 shuffle=True,
                 seed=None,
                 data_format=None,
                 save_to_dir=None,
                 save_prefix='',
                 save_format='png',
                 follow_links=False,
                 interpolation='nearest',
                 augment=False,
                 allow_speedandpitch=False,
                 allow_pitch=False,
                 allow_speed=False,
                 allow_dyn=False,
                 allow_noise=False,
                 allow_timeshift=False):
        if data_format is None:
            data_format = K.image_data_format()
        self.window_size = window_size
        self.window_stride = window_stride
        self.window_type = window_type
        self.normalize = normalize
        self.max_len = max_len
        self.directory = directory
        self.allow_speedandpitch = allow_speedandpitch
        self.allow_pitch = allow_pitch
        self.allow_speed = allow_speed
        self.allow_dyn = allow_dyn
        self.allow_noise = allow_noise
        self.allow_timeshift = allow_timeshift
        self.augment = augment
        #        self.image_data_generator = image_data_generator
        self.target_size = tuple(target_size)
        if color_mode not in {'rgb', 'grayscale'}:
            raise ValueError('Invalid color mode:', color_mode,
                             '; expected "rgb" or "grayscale".')
        self.color_mode = color_mode
        self.data_format = data_format
        if self.color_mode == 'rgb':
            if self.data_format == 'channels_last':
                self.image_shape = self.target_size + (3, )
            else:
                self.image_shape = (3, ) + self.target_size
        else:
            if self.data_format == 'channels_last':
                self.image_shape = self.target_size + (1, )
            else:
                self.image_shape = (1, ) + self.target_size
        self.classes = classes
        if class_mode not in {
                'categorical', 'binary', 'sparse', 'input', None
        }:
            raise ValueError(
                'Invalid class_mode:', class_mode,
                '; expected one of "categorical", '
                '"binary", "sparse", "input"'
                ' or None.')
        self.class_mode = class_mode
        self.save_to_dir = save_to_dir
        self.save_prefix = save_prefix
        self.save_format = save_format
        self.interpolation = interpolation

        white_list_formats = {'png', 'jpg', 'jpeg', 'bmp', 'ppm', 'wav'}

        # first, count the number of samples and classes
        self.samples = 0

        if not classes:
            classes = []
            for subdir in sorted(os.listdir(directory)):
                if os.path.isdir(os.path.join(directory, subdir)):
                    classes.append(subdir)
        self.num_classes = len(classes)
        self.class_indices = dict(zip(classes, range(len(classes))))

        pool = multiprocessing.pool.ThreadPool()
        function_partial = partial(_count_valid_files_in_directory,
                                   white_list_formats=white_list_formats,
                                   follow_links=follow_links)
        self.samples = sum(
            pool.map(function_partial,
                     (os.path.join(directory, subdir) for subdir in classes)))

        print('Found %d images belonging to %d classes.' %
              (self.samples, self.num_classes))

        # second, build an index of the images in the different class subfolders
        results = []

        self.filenames = []
        self.classes = np.zeros((self.samples, ), dtype='int32')
        i = 0
        for dirpath in (os.path.join(directory, subdir) for subdir in classes):
            results.append(
                pool.apply_async(_list_valid_filenames_in_directory,
                                 (dirpath, white_list_formats,
                                  self.class_indices, follow_links)))

        for res in results:
            classes, filenames = res.get()
            self.classes[i:i + len(classes)] = classes
            self.filenames += filenames
            if i == 0:
                img = spect_loader(os.path.join(self.directory, filenames[0]),
                                   self.window_size, self.window_stride,
                                   self.window_type, self.normalize,
                                   self.max_len, self.augment,
                                   self.allow_speedandpitch, self.allow_pitch,
                                   self.allow_speed, self.allow_dyn,
                                   self.allow_noise, self.allow_timeshift)
                img = np.swapaxes(img, 0, 2)
                self.target_size = tuple((img.shape[0], img.shape[1]))
                print(self.target_size)
                if self.color_mode == 'rgb':
                    if self.data_format == 'channels_last':
                        self.image_shape = self.target_size + (3, )
                    else:
                        self.image_shape = (3, ) + self.target_size
                else:
                    if self.data_format == 'channels_last':
                        self.image_shape = self.target_size + (1, )
                    else:
                        self.image_shape = (1, ) + self.target_size

            i += len(classes)
        pool.close()
        pool.join()
        super(SpeechDirectoryIterator, self).__init__(self.samples, batch_size,
                                                      shuffle, seed)
Example #32
def glitch_code(test_code,
                architecture,
                bytes_to_trash_in,
                flip_type,
                pool_instances,
                enable_cache=True,
                force_invalid=False):
    """

    :param code:
    :param bytes_to_trash: the index into the bytes (these must be sequential!)
    :param architecture:
    :return:
    """
    global code_initial, bytes_to_trash, flip_operation, result_cache
    global force_invalid_ins

    force_invalid_ins = force_invalid

    if not enable_cache:
        result_cache = None

    bytes_to_trash = bytes_to_trash_in

    if not init_architecture(architecture):
        return {}

    # assemble the program
    code_initial, count = KEYSTONE.asm(test_code)

    code = ''.join(map(chr, code_initial))
    asm = list(CAPSTONE.disasm(code, len(code_initial)))
    if len(asm) == 0:
        logger.warning('>>> {}\tdisasm failure'.format(code.encode('hex')))
    for ins in asm:
        logger.info('>>> {}\t{} {}'.format(code.encode('hex'), ins.mnemonic,
                                           ins.op_str))

    flip_operation = flip_type
    flip_type_str = flip_type.name

    # Init our pool
    if pool_instances is None:
        pool = MyPool()
    else:
        pool = MyPool(pool_instances)

    initial_bytes = [code_initial[x] for x in bytes_to_trash]
    results = {
        'input': {
            'initial_bytes': initial_bytes,
            'initial_code': code_initial
        },
        flip_type_str: {}
    }

    bit_list = range(len(bytes_to_trash) * 8)
    bit_count = range(len(bytes_to_trash) * 8 +
                      1)  # e.g., 0 to 16 bits to flip to include edge cases
    for number_of_bits_to_flip in bit_count:

        # Init our results
        results[flip_type_str][number_of_bits_to_flip] = {}

        # Let's fire off all of our threads in the pool
        logger.info("* Trying %d bit flips (%s)..." %
                    (number_of_bits_to_flip, flip_type_str))
        rtn_vals = pool.imap(
            test_program,
            itertools.combinations(bit_list, number_of_bits_to_flip))

        # Aggregate all of our results
        for rtn in rtn_vals:
            if rtn not in results[flip_type_str][number_of_bits_to_flip]:
                results[flip_type_str][number_of_bits_to_flip][rtn] = 1
            else:
                results[flip_type_str][number_of_bits_to_flip][rtn] += 1

        # Let's print them at each iteration to have some idea of progress
        # pprint.pprint(results[flip_type_str][number_of_bits_to_flip])

        logger.info(
            pprint.pformat(results[flip_type_str][number_of_bits_to_flip]))
        # logger.info(pprint.pformat(results))

        result_cache = {}

    # Close our pool up
    pool.close()
    pool.terminate()
    pool.join()

    # Clear cache, it's unlikely to be useful in the next run
    if result_cache is not None:
        result_cache = {}

    return results
Example #33
def buildDecisionTree(df, root, file, config, dataset_features, parent_level = 0, leaf_id = 0, parents = 'root', validation_df = None):
	
	models = []
	feature_names = df.columns[0:-1]
	
	enableParallelism = config['enableParallelism']
	algorithm = config['algorithm']
	
	json_file = file.split(".")[0]+".json"
	
	if root == 1:
		if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
			raw_df = df.copy()
	
	#--------------------------------------
	
	df_copy = df.copy()
	
	winner_name, num_of_instances, metric, metric_name = findDecision(df, config)
	
	#find winner index, this cannot be returned by find decision because columns dropped in previous steps
	j = 0 
	for i in dataset_features:
		if i == winner_name:
			winner_index = j
		j = j + 1
	
	numericColumn = False
	if dataset_features[winner_name] != 'object':
		numericColumn = True
	
	#restoration
	columns = df.shape[1]
	for i in range(0, columns-1):
		column_name = df.columns[i]; column_type = df[column_name].dtypes
		if column_type != 'object' and column_name != winner_name:
			df[column_name] = df_copy[column_name]
	
	classes = df[winner_name].value_counts().keys().tolist()
		
	#-----------------------------------------------------
	
	num_cores = config["num_cores"]
	
	input_params = []
	
	#serial approach
	for i in range(0,len(classes)):
		current_class = classes[i]
		subdataset = df[df[winner_name] == current_class]
		subdataset = subdataset.drop(columns=[winner_name])
		branch_index = i * 1
		
		#create branches serially
		if enableParallelism != True:
			
			if i == 0:
				#descriptor = "# Feature: "+winner_name+", Instances: "+str(num_of_instances)+", "+metric_name+": "+str(round(metric, 4))
				
				descriptor = {
					"feature": winner_name,
					"instances": num_of_instances,
					#"metric_name": metric_name,
					"metric_value": round(metric, 4),
					"depth": parent_level + 1
				}
				descriptor = "# "+json.dumps(descriptor)
				
				functions.storeRule(file, (functions.formatRule(root), "", descriptor))
			
			createBranch(config, current_class, subdataset, numericColumn, branch_index
				, winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric)
		else:
			input_params.append((config, current_class, subdataset, numericColumn, branch_index
				, winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric))
	
	#---------------------------
	#add else condition in the decision tree
	
	if df.Decision.dtypes == 'object': #classification
		pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index()
		pivot = pivot.rename(columns = {"Decision": "Instances","index": "Decision"})
		pivot = pivot.sort_values(by = ["Instances"], ascending = False).reset_index()
		
		else_decision = "return '%s'" % (pivot.iloc[0].Decision)
		
		if enableParallelism != True:
			functions.storeRule(file,(functions.formatRule(root), "else:"))
			functions.storeRule(file,(functions.formatRule(root+1), else_decision))
		else: #parallelism
			leaf_id = str(uuid.uuid1())
			custom_rule_file = "outputs/rules/"+str(leaf_id)+".txt"
			
			check_rule = "else: "+else_decision
			
			sample_rule = {}
			sample_rule["current_level"] = root
			sample_rule["leaf_id"] = leaf_id
			sample_rule["parents"] = parents
			sample_rule["rule"] = check_rule
			sample_rule["feature_idx"] = -1
			sample_rule["feature_name"] = ""
			sample_rule["instances"] = df.shape[0]
			sample_rule["metric"] = 0
			sample_rule["return_statement"] = 0
			
			#json to string
			sample_rule = json.dumps(sample_rule)
			
			functions.createFile(custom_rule_file, "")
			functions.storeRule(custom_rule_file, sample_rule)
			
	else: #regression
		else_decision = "return %s" % (subdataset.Decision.mean())
				
		if enableParallelism != True:
			functions.storeRule(file,(functions.formatRule(root), "else:"))
			functions.storeRule(file,(functions.formatRule(root+1), else_decision))
		else:
			leaf_id = str(uuid.uuid1())
			custom_rule_file = "outputs/rules/"+str(leaf_id)+".txt"
			
			check_rule = "else: "+else_decision
			
			sample_rule = "   {\n"
			sample_rule += "      \"current_level\": "+str(root)+",\n"
			sample_rule += "      \"leaf_id\": \""+str(leaf_id)+"\",\n"
			sample_rule += "      \"parents\": \""+parents+"\",\n"
			sample_rule += "      \"rule\": \""+check_rule+"\"\n"
			sample_rule += "   }"
			
			functions.createFile(custom_rule_file, "")
			functions.storeRule(custom_rule_file, sample_rule)
	
	#---------------------------
	
	#create branches in parallel
	if enableParallelism == True:
		"""
		#this usage causes trouble for recursive functions
		with Pool(number_of_cpus) as pool:
			pool.starmap(createBranch, input_params)
		"""
		
		pool = MyPool(num_cores)
		results = pool.starmap(createBranch, input_params)
		pool.close()
		pool.join()
	
	#---------------------------------------------
	
	if root == 1:
		
		if enableParallelism == True:

			#custom rules are stored in .txt files. merge them all in a json file
			
			functions.createFile(json_file, "[\n")
			
			custom_rules = []
			
			file_index = 0
			for file in os.listdir(os.getcwd()+"/outputs/rules"):
				if file.endswith(".txt"):
					custom_rules.append(os.getcwd()+"/outputs/rules/"+file)
					#print(file) #this file stores a custom rule
					f = open(os.getcwd()+"/outputs/rules/"+file, "r")
					custom_rule = f.read()
					
					if file_index > 0:
						custom_rule = ", "+custom_rule
					
					functions.storeRule(json_file, custom_rule)
					f.close()
					file_index = file_index + 1
					
			functions.storeRule(json_file, "]")
			
			#-----------------------------------
			
			#custom rules are already merged in a json file. clear messy custom rules
			#TO-DO: if random forest trees are handled in parallel, this would be a problem. You cannot know the related tree of a rule. You should store a global tree id in a rule.
			
			for file in custom_rules:
				os.remove(file)
			
			#-----------------------------------
			
			reconstructRules(json_file, feature_names)

			#feature importance should be calculated by demand?
			feature_importance(json_file, dataset_features)
			
			#-----------------------------------
		
		#is regular decision tree
		if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
		#this is a regular decision tree. find accuracy here.
			
			moduleName = "outputs/rules/rules"
			fp, pathname, description = imp.find_module(moduleName)
			myrules = imp.load_module(moduleName, fp, pathname, description) #rules0
			models.append(myrules)
			
	return models
Example #34
def _run_next_graph_nodes(graph, node, globals_, locals_, pool):

    operator = graph.node[node].get('OPERATOR', None)

    nodes_return_value = []

    return_value = None

    # False? Terminate Flow.

    if isinstance(locals_['_'], bool) and locals_['_'] is False:

        return False

    if operator:

        #   -->  (a)
        #   --> / | \
        #    (b) (c) (d)
        #       \ | /
        #        (e)

        next_nodes = sorted(graph.successors(node))

        # N-1

        for next_node in next_nodes[1:]:

            # Synchronous

            if operator == '|':

                nodes_return_value.append(
                    pool.apply(_run,
                               args=(graph, next_node, globals_, locals_, {},
                                     None, False)))

            # Asynchronous

            if operator == '->':

                nodes_return_value.append(
                    pool.apply_async(_run,
                                     args=(graph, next_node, globals_, locals_,
                                           {}, None, False)))

        # 1

        nodes_return_value.insert(
            0, _run(graph, next_nodes[0], globals_, locals_, {}, None, False))

        pool.close()

        pool.join()

        pool.terminate()

        return_value = __resolve_and_merge_results(nodes_return_value)

    else:

        #        (a)
        #       / | \
        #    (b) (c) (d)
        #       \ | /
        #    --> (e)

        return_value = locals_['_']

    return return_value
Example #35
    def __init__(self,
                 directory,
                 image_data_generator,
                 target_size=(256, 256),
                 color_mode='rgb',
                 classes=None,
                 class_mode='categorical',
                 batch_size=32,
                 shuffle=True,
                 seed=None,
                 data_format='channels_last',
                 save_to_dir=None,
                 save_prefix='',
                 save_format='png',
                 follow_links=False,
                 subset=None,
                 interpolation='nearest',
                 dtype='float32'):
        super(DirectoryIterator,
              self).common_init(image_data_generator, target_size, color_mode,
                                data_format, save_to_dir, save_prefix,
                                save_format, subset, interpolation)
        self.directory = directory
        self.classes = classes
        if class_mode not in self.allowed_class_modes:
            raise ValueError(
                'Invalid class_mode: {}; expected one of: {}'.format(
                    class_mode, self.allowed_class_modes))
        self.class_mode = class_mode
        self.dtype = dtype
        # First, count the number of samples and classes.
        self.samples = 0

        if not classes:
            classes = []
            for subdir in sorted(os.listdir(directory)):
                if os.path.isdir(os.path.join(directory, subdir)):
                    classes.append(subdir)
        self.num_classes = len(classes)
        self.class_indices = dict(zip(classes, range(len(classes))))

        pool = multiprocessing.pool.ThreadPool()

        # Second, build an index of the images
        # in the different class subfolders.
        results = []
        self.filenames = []
        i = 0
        for dirpath in (os.path.join(directory, subdir) for subdir in classes):
            results.append(
                pool.apply_async(_list_valid_filenames_in_directory,
                                 (dirpath, self.white_list_formats, self.split,
                                  self.class_indices, follow_links)))
        classes_list = []
        for res in results:
            classes, filenames = res.get()
            classes_list.append(classes)
            self.filenames += filenames
        self.samples = len(self.filenames)
        self.classes = np.zeros((self.samples, ), dtype='int32')
        for classes in classes_list:
            self.classes[i:i + len(classes)] = classes
            i += len(classes)

        print('Found %d images belonging to %d classes.' %
              (self.samples, self.num_classes))
        pool.close()
        pool.join()
        super(DirectoryIterator, self).__init__(self.samples, batch_size,
                                                shuffle, seed)
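The ThreadPool here (and in the related directory iterators below) is a good fit because the per-directory work is dominated by filesystem I/O, during which CPython releases the GIL.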
Example #36
    def __init__(self,
                 directory,
                 image_data_generator,
                 augmentations=None,
                 target_size=(256, 256, 256),
                 num_channels=1,
                 num_patches=1,
                 classes=None,
                 class_mode='categorical',
                 batch_size=32,
                 shuffle=True,
                 seed=None,
                 save_to_dir=None,
                 save_prefix='',
                 save_format='png',
                 axial_slice=None,
                 follow_links=False,
                 split=None):
        self.directory = directory
        self.image_data_generator = image_data_generator
        self.augmentations = augmentations
        self.target_size = tuple(target_size)
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.image_shape = self.target_size + (num_channels, )
        self.classes = classes
        if class_mode not in {
                'categorical', 'binary', 'sparse', 'input', None
        }:
            raise ValueError(
                'Invalid class_mode:', class_mode,
                '; expected one of "categorical", '
                '"binary", "sparse", "input"'
                ' or None.')
        self.class_mode = class_mode

        self.save_to_dir = save_to_dir
        self.save_prefix = save_prefix
        self.save_format = save_format
        self.axial_slice = axial_slice

        white_list_formats = {'nii', 'nii.gz'}

        # Counter number of samples and classes
        self.samples = 0

        if not classes:
            classes = []
            for subdir in sorted(os.listdir(directory)):
                if os.path.isdir(os.path.join(directory, subdir)):
                    classes.append(subdir)
        self.num_classes = len(classes)
        self.class_indices = dict(zip(classes, range(len(classes))))

        pool = multiprocessing.pool.ThreadPool()

        function_partial = partial(_count_valid_files_in_directory,
                                   white_list_formats=white_list_formats,
                                   follow_links=follow_links,
                                   split=None)
        self.samples = sum(
            pool.map(function_partial,
                     (os.path.join(directory, subdir) for subdir in classes)))
        print('Found %d images belonging to %d classes.' %
              (self.samples, self.num_classes))

        # Build an index of the images in the different class subfolders
        results = []
        self.filenames = []
        self.classes = np.zeros((self.samples, ), dtype='int32')
        i = 0
        for dirpath in (os.path.join(directory, subdir) for subdir in classes):
            results.append(
                pool.apply_async(_list_valid_filenames_in_directory,
                                 (dirpath, white_list_formats, split,
                                  self.class_indices, follow_links)))
        for res in results:
            classes, filenames = res.get()
            self.classes[i:i + len(classes)] = classes
            self.filenames += filenames
            i += len(classes)

        pool.close()
        pool.join()

        super(NIfTIDirectoryIterator, self).__init__(self.samples, batch_size,
                                                     shuffle, seed)
Example #37
    def __init__(self,
                 pathlists,
                 classes,
                 image_data_generator,
                 target_size=(256, 256),
                 color_mode='rgb',
                 class_mode='categorical',
                 batch_size=32,
                 shuffle=True,
                 seed=None,
                 data_format='channels_last',
                 save_to_dir=None,
                 save_prefix='',
                 save_format='png',
                 subset=None,
                 interpolation='nearest',
                 dtype='float32'):
        if data_format is None:
            data_format = backend.image_data_format()
        self.image_data_generator = image_data_generator
        self.target_size = tuple(target_size)
        if color_mode not in {'rgb', 'rgba', 'grayscale'}:
            raise ValueError('Invalid color mode:', color_mode,
                             '; expected "rgb", "rgba", or "grayscale".')
        self.color_mode = color_mode
        self.data_format = data_format
        if self.color_mode == 'rgba':
            if self.data_format == 'channels_last':
                self.image_shape = self.target_size + (4, )
            else:
                self.image_shape = (4, ) + self.target_size
        elif self.color_mode == 'rgb':
            if self.data_format == 'channels_last':
                self.image_shape = self.target_size + (3, )
            else:
                self.image_shape = (3, ) + self.target_size
        else:
            if self.data_format == 'channels_last':
                self.image_shape = self.target_size + (1, )
            else:
                self.image_shape = (1, ) + self.target_size
        self.save_to_dir = save_to_dir
        self.save_prefix = save_prefix
        self.save_format = save_format
        self.interpolation = interpolation

        if subset is not None:
            validation_split = self.image_data_generator._validation_split
            if subset == 'validation':
                split = (0, validation_split)
            elif subset == 'training':
                split = (validation_split, 1)
            else:
                raise ValueError('Invalid subset name: ', subset,
                                 '; expected "training" or "validation"')
        else:
            split = None
        self.subset = subset

        self.pathlists = pathlists
        self.classes = classes

        if class_mode not in {
                'categorical', 'binary', 'sparse', 'input', None
        }:
            raise ValueError(
                'Invalid class_mode:', class_mode,
                '; expected one of "categorical", '
                '"binary", "sparse", "input"'
                ' or None.')
        self.class_mode = class_mode
        self.dtype = dtype

        white_list_formats = {
            'png', 'jpg', 'jpeg', 'bmp', 'ppm', 'tif', 'tiff'
        }
        # First, count the number of samples and classes.
        self.samples = 0

        self.num_classes = len(classes)
        self.class_indices = dict(zip(classes, range(len(classes))))

        pool = multiprocessing.pool.ThreadPool()

        self.samples = sum(pool.map(len, pathlists))

        print('Found %d images belonging to %d classes.' %
              (self.samples, self.num_classes),
              file=sys.stderr)

        # Second, build an index of the images
        # in the different class subfolders.
        results = []
        self.filenames = []
        self.classes = np.zeros((self.samples, ), dtype='int32')

        i = 0
        for pathlist, class_index in zip(pathlists,
                                         self.class_indices.items()):
            self.filenames += pathlist
            self.classes[i:i + len(pathlist)] = class_index[1]
            i += len(pathlist)

        pool.close()
        pool.join()

        super(PathListsIterator, self).__init__(self.samples, batch_size,
                                                shuffle, seed)
Example #38
def _spider(url, visited, root, depth, max_depth, raise_on_error):
    """Fetches URL and any pages it links to up to max_depth.

       depth should initially be zero, and max_depth is the max depth of
       links to follow from the root.

       Prints out a warning only if the root can't be fetched; it ignores
       errors with pages that the root links to.

       Returns a tuple of:
       - pages: dict of pages visited (URL) mapped to their full text.
       - links: set of links encountered while visiting the pages.
    """
    pages = {}  # dict from page URL -> text content.
    links = set()  # set of all links seen on visited pages.

    # root may end with index.html -- chop that off.
    if root.endswith('/index.html'):
        root = re.sub('/index.html$', '', root)

    try:
        response_url, page = _read_from_url(url, 'text/html')

        if not response_url or not page:
            return pages, links

        pages[response_url] = page

        # Parse out the links in the page
        link_parser = LinkParser()
        subcalls = []
        link_parser.feed(page)

        while link_parser.links:
            raw_link = link_parser.links.pop()
            abs_link = urljoin(response_url, raw_link.strip())

            links.add(abs_link)

            # Skip stuff that looks like an archive
            if any(raw_link.endswith(suf) for suf in ALLOWED_ARCHIVE_TYPES):
                continue

            # Skip things outside the root directory
            if not abs_link.startswith(root):
                continue

            # Skip already-visited links
            if abs_link in visited:
                continue

            # If we're not at max depth, follow links.
            if depth < max_depth:
                subcalls.append((abs_link, visited, root, depth + 1, max_depth,
                                 raise_on_error))
                visited.add(abs_link)

        if subcalls:
            pool = NonDaemonPool(processes=len(subcalls))
            try:
                results = pool.map(_spider_wrapper, subcalls)

                for sub_pages, sub_links in results:
                    pages.update(sub_pages)
                    links.update(sub_links)

            finally:
                pool.terminate()
                pool.join()

    except URLError as e:
        tty.debug(e)

        if hasattr(e, 'reason') and isinstance(e.reason, ssl.SSLError):
            tty.warn("Spack was unable to fetch url list due to a certificate "
                     "verification problem. You can try running spack -k, "
                     "which will not check SSL certificates. Use this at your "
                     "own risk.")

        if raise_on_error:
            raise NoNetworkConnectionError(str(e), url)

    except HTMLParseError as e:
        # This error indicates that Python's built-in HTML parser choked on the page.
        msg = "Got an error parsing HTML."

        # Pre-2.7.3 Pythons in particular have rather prickly HTML parsing.
        if sys.version_info[:3] < (2, 7, 3):
            msg += " Use Python 2.7.3 or newer for better HTML parsing."

        tty.warn(msg, url, "HTMLParseError: " + str(e))

    except Exception as e:
        # Other types of errors are completely ignored, except in debug mode.
        tty.debug("Error in _spider: %s:%s" % (type(e), e),
                  traceback.format_exc())

    return pages, links
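
The fan-out step above hands each unvisited link to a worker and merges the (pages, links) results back together. A reduced sketch of that merge pattern; NonDaemonPool is assumed to be a pool whose workers may spawn further pools (its definition is not shown here), so this sketch uses a thread pool to stay nestable and runnable on its own, with an illustrative stand-in for the recursive call.

import multiprocessing.pool

def fetch_one(url):
    """Stand-in for one recursive _spider call: return (pages, links) for `url`."""
    return {url: "<html>stub page</html>"}, {url + "/child"}

def spider_level(sub_urls):
    pages, links = {}, set()
    if not sub_urls:
        return pages, links
    pool = multiprocessing.pool.ThreadPool(processes=len(sub_urls))
    try:
        for sub_pages, sub_links in pool.map(fetch_one, sub_urls):
            pages.update(sub_pages)   # merge page texts from every child call
            links.update(sub_links)   # merge every link seen by the children
    finally:
        pool.terminate()
        pool.join()
    return pages, links

# pages, links = spider_level(["https://example.com/a", "https://example.com/b"])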
Beispiel #39
0
def run_instances(database, instances, filter_string, ubxlib_dir, working_dir,
                  clean, summary_report_file, test_report_file, debug_file):
    '''Run the given instances'''
    return_value = 0
    processes = []
    platform_locks = []
    misc_locks = {}
    alive_count = 0
    report_thread = None
    report_queue = None
    reporter = None
    summary_report_file_path = None
    test_report_file_path = None
    debug_file_path = None
    summary_report_handle = None

    manager = Manager()

    # Create a lock to cover things that cross
    # platforms or that any process of u_run.main()
    # may need to perform outside of its working
    # directory
    misc_locks["system_lock"] = manager.RLock()

    # Create a lock which can be used on Nordic
    # platforms (nRF5 and Zephyr): performing a
    # JLink download to a board while JLink RTT logging
    # is active on any other board will often stop
    # the RTT logging even though the sessions are
    # aimed at debuggers with entirely different
    # serial numbers.
    misc_locks["jlink_lock"] = manager.RLock()

    # Create a "lock" that can be used on STM32F4
    # platforms to ensure that all downloads are
    # completed before logging commences.  We
    # can do this, rather than locking a tool for the
    # whole time as we have to do with Nordic, because
    # each STM32F4 board only runs a single instance
    misc_locks["stm32f4_downloads_list"] = manager.list()

    # Some platforms cope poorly with being run in
    # multiple instances at once, hence we create a lock
    # per platform and pass it into the instance so that
    # it can manage multiplicity if required
    create_platform_locks(database, instances, manager, platform_locks)

    # Launch a thread that prints stuff out
    # nicely from multiple sources
    print_queue = manager.Queue()
    print_thread = u_utils.PrintThread(print_queue)
    print_thread.start()

    # Set up a printer for this thread to print to the queue
    printer = u_utils.PrintToQueue(print_queue, None, True)

    if summary_report_file:
        # Launch a thread that manages reporting
        # from multiple sources
        summary_report_file_path = working_dir + os.sep + summary_report_file
        summary_report_handle = open(summary_report_file_path, "w")
        if summary_report_handle:
            printer.string("{}writing overall summary report to \"{}\".".  \
                           format(PROMPT, summary_report_file_path))
        else:
            printer.string("{}unable to open file \"{}\" for overall summary report.".   \
                           format(PROMPT, summary_report_file_path))
        report_queue = manager.Queue()
        report_thread = u_report.ReportThread(report_queue,
                                              summary_report_handle)
        report_thread.start()
        reporter = u_report.ReportToQueue(report_queue, None, None, printer)
        reporter.open()

    # From this post:
    # https://stackoverflow.com/questions/11312525/catch-ctrlc-sigint-and-exit-multiprocesses-gracefully-in-python
    # ...create a pool of worker processes to run our
    # instances, then they will handle sigint correctly
    # and tidy up after themselves.

    # SIGINT is ignored while the pool is created
    original_sigint_handler = signal(SIGINT, SIG_IGN)
    pool = NoDaemonPool(len(instances))
    signal(SIGINT, original_sigint_handler)

    # Create locks for connections
    u_connection.init_locks(manager)

    try:
        # Set up all the instances
        for instance in instances:
            # Provide a working directory that is unique
            # for each instance and make sure it exists
            if working_dir:
                this_working_dir = working_dir + os.sep +       \
                                   INSTANCE_DIR_PREFIX + \
                                   u_utils.get_instance_text(instance).replace(".", "_")
            else:
                this_working_dir = os.getcwd() + os.sep +       \
                                   INSTANCE_DIR_PREFIX + \
                                   u_utils.get_instance_text(instance).replace(".", "_")
            if not os.path.isdir(this_working_dir):
                os.makedirs(this_working_dir)
            # Only clean the working directory if requested
            if clean:
                u_utils.deltree(this_working_dir, printer, PROMPT)
                os.makedirs(this_working_dir)

            # Create the file paths for this instance
            if summary_report_file:
                summary_report_file_path = this_working_dir + os.sep + summary_report_file
            if test_report_file:
                test_report_file_path = this_working_dir + os.sep + test_report_file
            if debug_file:
                debug_file_path = this_working_dir + os.sep + debug_file

            # Start u_run.main in each worker thread
            process = {}
            process["platform"] = u_data.get_platform_for_instance(
                database, instance)
            process["instance"] = instance
            process["platform_lock"] = None
            process["connection_lock"] = u_connection.get_lock(instance)
            for platform_lock in platform_locks:
                if process["platform"] == platform_lock["platform"]:
                    process["platform_lock"] = platform_lock["lock"]
                    break
            process["handle"] = pool.apply_async(
                u_run.main,
                (database, instance, filter_string, True, ubxlib_dir,
                 this_working_dir, process["connection_lock"],
                 process["platform_lock"], misc_locks, print_queue,
                 report_queue, summary_report_file_path, test_report_file_path,
                 debug_file_path))
            alive_count += 1
            processes.append(process.copy())

        # Wait for all the launched processes to complete
        printer.string("{}all instances now launched.".format(PROMPT))
        loop_count = 0
        while alive_count > 0:
            for process in processes:
                instance_text = u_utils.get_instance_text(process["instance"])
                if not "dealt_with" in process and process["handle"].ready():
                    try:
                        # If the return value has gone negative, i.e.
                        # an infrastructure failure, leave it there,
                        # else add the number of test failures to it
                        if (return_value >= 0 and process["handle"].get() > 0) or \
                            (return_value <= 0 and process["handle"].get() < 0):
                            return_value += process["handle"].get()
                    except KeyboardInterrupt as ex:
                        raise KeyboardInterrupt from ex
                    except Exception as ex:
                        # If an instance threw an exception then flag an
                        # infrastructure error
                        return_value = -1
                        printer.string("{}instance {} threw exception \"{}:"    \
                                       " {}\" but I can't tell you where"       \
                                       " I'm afraid.".                          \
                                       format(PROMPT, instance_text,
                                              type(ex).__name__, str(ex)))
                        if reporter:
                            reporter.event(u_report.EVENT_TYPE_INFRASTRUCTURE,
                                           u_report.EVENT_FAILED,
                                           "instance {} threw exception \"{}: {}\"". \
                                           format(instance_text, type(ex).__name__,
                                                  str(ex)))
                    alive_count -= 1
                    process["dealt_with"] = True
                if not process["handle"].ready() and                         \
                   (loop_count == STILL_RUNNING_REPORT_SECONDS):
                    printer.string("{}instance {} still running.".           \
                                        format(PROMPT, instance_text))
            loop_count += 1
            if loop_count > STILL_RUNNING_REPORT_SECONDS:
                loop_count = 0
            sleep(1)

    except KeyboardInterrupt:
        # Pools can tidy themselves up on SIGINT
        printer.string(
            "{}caught CTRL-C, terminating instances...".format(PROMPT))
        if reporter:
            reporter.event(u_report.EVENT_TYPE_INFRASTRUCTURE,
                           u_report.EVENT_FAILED,
                           "CTRL-C received, terminating")
        pool.terminate()
        return_value = -1

    # Tidy up
    pool.close()
    pool.join()
    if reporter:
        reporter.event_extra_information("return value overall {} (0 = success, negative ="   \
                                         " probable infrastructure failure, positive ="       \
                                         " failure(s) (may still be due to infrastructure))". \
                                         format(return_value))
        reporter.close()

    # Wait for the print and report queues to empty
    # and stop the print process
    printer.string("{}all runs complete, return value {}.".format(
        PROMPT, return_value))
    sleep(1)
    print_thread.stop_thread()
    print_thread.join()

    # Stop the reporting process
    if report_thread:
        report_thread.stop_thread()
        report_thread.join()

    if summary_report_handle:
        summary_report_handle.close()

    return return_value
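
The SIGINT handling above follows the pattern from the linked Stack Overflow answer: ignore SIGINT while the pool is created so the workers inherit the "ignore" handler, then restore it so only the parent reacts to CTRL-C and can terminate the pool. A minimal sketch of that pattern, with a purely illustrative worker function:

import time
from multiprocessing import Pool
from signal import SIG_IGN, SIGINT, signal

def slow_task(seconds):
    time.sleep(seconds)
    return seconds

if __name__ == "__main__":
    # SIGINT is ignored while the pool is created, so the workers inherit SIG_IGN
    original_sigint_handler = signal(SIGINT, SIG_IGN)
    pool = Pool(processes=2)
    # ...then the parent gets its handler back and is the only one to see CTRL-C
    signal(SIGINT, original_sigint_handler)
    try:
        results = [pool.apply_async(slow_task, (n,)) for n in range(4)]
        # a timeout keeps get() interruptible by KeyboardInterrupt
        print([r.get(60) for r in results])
        pool.close()
    except KeyboardInterrupt:
        pool.terminate()   # tidy up all the workers on CTRL-C
    pool.join()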
Beispiel #40
0
    def stest(self, repeat, threshold, mp):

        print("\n\tIntersection random subsampling test:\n    Repeat " +
              str(repeat) + " times\n")
        self.test_time = repeat
        self.test_d = {}
        plist = OrderedDict()

        for ty in self.groupedreference.keys():
            self.test_d[ty] = {}
            plist[ty] = OrderedDict()
            for r in self.groupedreference[ty]:
                if r.name in self.nalist:
                    continue
                print("\t" + r.name)
                self.test_d[ty][r.name] = {}
                plist[ty][r.name] = OrderedDict()
                print("\t.", end="")
                sys.stdout.flush()
                for q in self.groupedquery[ty]:
                    if r.name == q.name:
                        continue
                    else:
                        print(".", end="")
                        sys.stdout.flush()
                        if q.name in self.nalist:
                            continue
                        # True intersection
                        obs = self.counts[ty][r.name][q.name]
                        qn = q.name
                        if obs[2] == 0:
                            aveinter, chisq, p = "NA", "NA", "1"
                        else:
                            com = q.combine(r, change_name=False, output=True)
                            # Randomization
                            d = []

                            inp = [
                                com, self.rlen[ty][r.name], self.mode_count,
                                threshold
                            ]
                            mp_input = [inp for i in range(repeat)]

                            pool = multiprocessing.Pool(processes=mp)
                            mp_output = pool.map(mp_count_intersets, mp_input)
                            pool.close()
                            pool.join()

                            # for i in range(repeat):
                            #    random_r,random_q = com.random_split(size=self.rlen[ty][r.name])
                            #    d.append(random_r.intersect_count(random_q, mode_count=self.mode_count, threshold=threshold))
                            # d.append(count_intersect(random_r, random_q, mode_count=self.mode_count, threshold=threshold))
                            da = numpy.array(mp_output)

                            exp_m = numpy.mean(da, axis=0)
                            # print(exp_m)
                            # print(obs)
                            chisq, p, dof, expected = stats.chi2_contingency(
                                [exp_m, obs])
                            aveinter = exp_m[2]

                        plist[ty][r.name][qn] = p
                        self.test_d[ty][r.name][qn] = [aveinter, chisq, p]
                print()

            multiple_correction(plist)

            # c_p = 0
            for r in self.test_d[ty].keys():
                if r in self.nalist:
                    continue
                for q in self.test_d[ty][r].keys():
                    self.test_d[ty][r][q][2] = plist[ty][r][q]
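
The randomization block above replicates one input tuple `repeat` times, maps a worker over the replicates, and averages the replicate outputs before the chi-square test. A stripped-down sketch of that replicate-and-average pattern; `random_overlap` is a hypothetical stand-in for mp_count_intersets and the numbers are illustrative.

import multiprocessing
import random

import numpy

def random_overlap(args):
    n_regions, _threshold = args
    # pretend each replicate produces (count_a, count_b, count_intersection)
    return (n_regions, n_regions, random.randint(0, n_regions))

if __name__ == "__main__":
    repeat, processes = 100, 4
    inp = (50, 0.5)                      # shared parameters for every replicate
    mp_input = [inp] * repeat            # one entry per random replicate

    pool = multiprocessing.Pool(processes=processes)
    mp_output = pool.map(random_overlap, mp_input)
    pool.close()
    pool.join()

    expected = numpy.mean(numpy.array(mp_output), axis=0)
    print("expected counts per replicate:", expected)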
Beispiel #41
0
    def count_intersect(self, threshold, frequency=True):

        self.counts = OrderedDict()
        self.rlen, self.qlen = {}, {}
        self.nalist = []

        if frequency:
            self.frequency = OrderedDict()

        # if self.mode_count == "bp":
        #    print2(self.parameter, "\n{0}\t{1}\t{2}\t{3}\t{4}".format("Reference","Length(bp)", "Query", "Length(bp)", "Length of Intersection(bp)"))
        # elif self.mode_count == "count":
        #    print2(self.parameter, "\n{0}\t{1}\t{2}\t{3}\t{4}".format("Reference","sequence_number", "Query", "sequence_number", "Number of Intersection"))

        for ty in self.groupedreference.keys():
            self.counts[ty] = OrderedDict()
            self.rlen[ty], self.qlen[ty] = OrderedDict(), OrderedDict()
            if frequency:
                self.frequency[ty] = OrderedDict()

            for r in self.groupedreference[ty]:
                if r.total_coverage() == 0 and len(r) > 0:
                    self.nalist.append(r.name)
                    continue
                else:
                    self.counts[ty][r.name] = OrderedDict()
                    if self.mode_count == "bp":
                        rlen = r.total_coverage()
                    elif self.mode_count == "count":
                        rlen = len(r)
                    self.rlen[ty][r.name] = rlen

                    mp_input = []
                    for q in self.groupedquery[ty]:
                        if r.name == q.name:
                            continue
                        else:
                            mp_input.append([
                                q, self.nalist, self.mode_count, self.qlen,
                                threshold, self.counts, frequency,
                                self.frequency, ty, r
                            ])
                    # q, nalist, mode_count, qlen_dict, threshold, counts, frequency, self_frequency, ty, r
                    pool = multiprocessing.Pool(
                        processes=multiprocessing.cpu_count() - 1)
                    mp_output = pool.map(mp_count_intersect, mp_input)
                    pool.close()
                    pool.join()

                    # qname, nalist, qlen_dict[ty][q.name], counts[ty][r.name][q.name], self_frequency[ty][q.name].append(c[2])
                    for output in mp_output:
                        if output[1]:
                            self.nalist.append(output[1])
                        else:
                            self.qlen[ty][output[0]] = output[2]
                            self.counts[ty][r.name][output[0]] = output[3]
                            # print(r.name)
                            # print(output[0])
                            # print(output[3])
                            try:
                                self.frequency[ty][output[0]][
                                    r.name] = output[3][2]
                            except KeyError:
                                self.frequency[ty][output[0]] = {}
                                self.frequency[ty][output[0]][
                                    r.name] = output[3][2]
Beispiel #42
0
    def learn(self,
              selected_data,
              verbose=None,
              n_min=1,
              limit=None,
              score_max=fpconst.PosInf,
              score_delta=fpconst.PosInf,
              cores=False,
              picloud=False):

        if verbose:
            print 'Learning parents of', selected_data.vertex.name, '...',
        v = selected_data.vertex
        nd = len(selected_data)
        parents = selected_data.parents
        p_weights = selected_data.weights
        n = len(parents)
        try:
            lim = int(limit)
        except TypeError:
            lim = n

        selected_data_empty = selected_data.subset([])
        mindata = self.lower_bound_for_data_score(selected_data_empty)

        min_set = minset(n_min,score_max,score_delta,self.data_score(selected_data_empty)+\
                self.graph_score(n,v,[],nd))
        if n:
            w_min = p_weights[parents[0]]
            w_max = p_weights[parents[-1]]
            if w_min == w_max:
                if verbose:
                    print "Using algorithm 2"

                weight = w_min
                size = 1

                mg = self.graph_score(n, v, [weight], nd)
                while min_set.accepts(mg + mindata) and (size <= lim):
                    if cores:
                        import multiprocessing
                        import multiprocessing.pool

                        pool = multiprocessing.Pool(cores)
                        sub_obj = list(self.subsets(parents, size))

                        import itertools
                        results = pool.map(looper, [(selected_data, y, self)
                                                    for y in sub_obj])
                        pool.close()
                        pool.join()

                        for result, sub in itertools.izip(results, sub_obj):
                            min_set.add(mg + result, sub)

                    else:
                        for sub in self.subsets(parents, size):
                            selected_data_sub = selected_data.subset(sub)
                            min_set.add(
                                mg + self.data_score(selected_data_sub), sub)

                    size += 1
                    mg = self.graph_score(n, v, [weight] * size, nd)

            else:
                if verbose:
                    print "Using algorithm 1"
                if cores:
                    import multiprocessing
                    import multiprocessing.pool
                    pool = multiprocessing.Pool(cores)
                    size = 1
                    results = [1]
                    while (True in results) and (size <= lim):
                        subs = list(self.subsets(parents, size))
                        scores = pool.map(looper, [(selected_data, y, self)
                                                   for y in subs])

                        mgs = []
                        for sub in subs:
                            weight = 0
                            for parent in sub:
                                weight = weight + p_weights[parent]
                            mgs.append(self.graph_score(n, v, [weight], nd))

                        import itertools
                        for score, sub, mg in itertools.izip(
                                scores, subs, mgs):
                            min_set.add(mg + score, sub)

                        results = pool.map(unwrap_min_set_accepts,
                                           [(min_set, mg + mindata)
                                            for mg in mgs])
                        del mgs, subs, scores
                        size += 1

                    pool.close()
                    pool.join()

                else:
                    subsets = []
                    for parent in parents:
                        heappush(subsets, (self.graph_score(
                            n, v, [p_weights[parent]],
                            nd), [p_weights[parent]], [parent]))
                    while subsets:
                        mg, weights, sub = heappop(subsets)
                        if not min_set.accepts(mg + mindata):
                            break
                        selected_data_sub = selected_data.subset(sub)
                        min_set.add(mg + self.data_score(selected_data_sub),
                                    sub)
                        if len(sub) < lim:
                            last_parent = parents.index(sub[-1])
                            for parent in parents[last_parent + 1:]:
                                sub_succ = sub + [parent]
                                weights_succ = weights + [p_weights[parent]]
                                mg_succ = self.graph_score(
                                    n, v, weights_succ, nd)
                                heappush(subsets,
                                         (mg_succ, weights_succ, sub_succ))

        if verbose:
            print 'done', min_set
        return min_set.optimal, min_set.tolist()
Beispiel #43
0
    def learn_1(self,
                selected_data,
                verbose=None,
                n_min=1,
                limit=None,
                score_max=fpconst.PosInf,
                score_delta=fpconst.PosInf,
                cores=False,
                picloud=False):

        if verbose:
            print 'Learning parents of', selected_data.vertex.name, '...',

#        if not self.sloops:
#            selected_data.rm_sloops()

        v = selected_data.vertex
        nd = len(selected_data)
        parents = selected_data.parents
        p_weights = selected_data.weights
        n = len(parents)
        try:
            lim = int(limit)
        except TypeError:  #limit was None
            lim = n

        selected_data_empty = selected_data.subset([])
        mindata = self.lower_bound_for_data_score(selected_data_empty)

        min_set = minset(n_min,score_max,score_delta,self.data_score(selected_data_empty)+\
                self.graph_score(n,v,[],nd)) #empty parents set
        if n:  # are there any potential parents?
            w_min = p_weights[parents[0]]
            w_max = p_weights[parents[-1]]
            if w_min == w_max:  # we can use algorithm 2
                if verbose:
                    print "Using algorithm 2"

                weight = w_min
                size = 1

                mg = self.graph_score(n, v, [weight], nd)
                while min_set.accepts(mg + mindata) and (
                        size <=
                        lim):  #we can possibly add (sub-)optimal scores

                    # Parallelized version
                    if cores:
                        import multiprocessing
                        import multiprocessing.pool

                        pool = multiprocessing.Pool(cores)
                        sub_obj = list(self.subsets(parents, size))

                        import itertools
                        results = pool.map(looper, [(selected_data, y, self)
                                                    for y in sub_obj])
                        pool.close()
                        pool.join()

                        for result, sub in itertools.izip(results, sub_obj):
                            min_set.add(mg + result, sub)

                    else:
                        for sub in self.subsets(parents, size):
                            #print "sub.size ", len(sub)
                            selected_data_sub = selected_data.subset(sub)
                            min_set.add(
                                mg + self.data_score(selected_data_sub), sub)

                    size += 1
                    mg = self.graph_score(n, v, [weight] * size, nd)

            else:  # we have to use algorithm 1
                if verbose:
                    print "Using algorithm 1"

                # Parallelized version
                if cores:
                    import multiprocessing
                    import multiprocessing.pool
                    pool = multiprocessing.Pool(cores)
                    size = 1
                    results = [1]
                    while (True in results) and (size <= lim):
                        subs = list(self.subsets(parents, size))
                        scores = pool.map(looper, [(selected_data, y, self)
                                                   for y in subs])

                        mgs = []
                        for sub in subs:
                            weight = 0
                            for parent in sub:
                                weight = weight + p_weights[parent]
                            mgs.append(self.graph_score(n, v, [weight], nd))

                        import itertools
                        for score, sub, mg in itertools.izip(
                                scores, subs, mgs):
                            min_set.add(mg + score, sub)

                        results = pool.map(unwrap_min_set_accepts,
                                           [(min_set, mg + mindata)
                                            for mg in mgs])
                        del mgs, subs, scores
                        size += 1

                    pool.close()
                    pool.join()

                else:
                    subsets = [
                    ]  # successors of the potential parent sets considered so far
                    for parent in parents:  #one-element parents sets
                        #print "one parent"
                        heappush(subsets, (self.graph_score(
                            n, v, [p_weights[parent]],
                            nd), [p_weights[parent]], [parent]))
                    while subsets:
                        #print subsets
                        mg, weights, sub = heappop(subsets)
                        #print sub
                        if not min_set.accepts(
                                mg + mindata):  #we cannot improve the score
                            break
                        selected_data_sub = selected_data.subset(sub)
                        min_set.add(mg + self.data_score(selected_data_sub),
                                    sub)
                        #insert sub's successors
                        if len(sub) < lim:
                            last_parent = parents.index(sub[-1])
                            for parent in parents[last_parent + 1:]:
                                sub_succ = sub + [parent]
                                weights_succ = weights + [p_weights[parent]]
                                mg_succ = self.graph_score(
                                    n, v, weights_succ, nd)
                                heappush(subsets,
                                         (mg_succ, weights_succ, sub_succ))

        if verbose:
            print 'done', min_set
        return min_set.optimal, min_set.tolist()
Beispiel #44
0
def buildDecisionTree(df,
                      root,
                      file,
                      config,
                      dataset_features,
                      parent_level=0,
                      leaf_id=0,
                      parents='root'):

    models = []

    enableParallelism = config['enableParallelism']
    algorithm = config['algorithm']

    json_file = file.split(".")[0] + ".json"

    if root == 1:
        if config['enableRandomForest'] != True and config[
                'enableGBM'] != True and config['enableAdaboost'] != True:
            raw_df = df.copy()

    #--------------------------------------

    df_copy = df.copy()

    winner_name = findDecision(df, config)

    #find the winner index; this cannot be returned by findDecision because columns were dropped in previous steps
    j = 0
    for i in dataset_features:
        if i == winner_name:
            winner_index = j
        j = j + 1

    numericColumn = False
    if dataset_features[winner_name] != 'object':
        numericColumn = True

    #restoration
    columns = df.shape[1]
    for i in range(0, columns - 1):
        column_name = df.columns[i]
        column_type = df[column_name].dtypes
        if column_type != 'object' and column_name != winner_name:
            df[column_name] = df_copy[column_name]

    classes = df[winner_name].value_counts().keys().tolist()

    #-----------------------------------------------------

    #TO-DO: you should specify the number of cores in config
    num_cores = int(multiprocessing.cpu_count() /
                    2)  #allocate half of your total cores

    input_params = []

    #serial approach
    for i in range(0, len(classes)):
        current_class = classes[i]
        subdataset = df[df[winner_name] == current_class]
        subdataset = subdataset.drop(columns=[winner_name])
        branch_index = i * 1

        #create branches serially
        if enableParallelism != True:
            createBranch(config, current_class, subdataset, numericColumn,
                         branch_index, winner_index, root, parents, file,
                         dataset_features)
        else:
            input_params.append((config, current_class, subdataset,
                                 numericColumn, branch_index, winner_index,
                                 root, parents, file, dataset_features))

    #---------------------------
    #add else condition in the decision tree

    if df.Decision.dtypes == 'object':  #classification
        pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index()
        pivot = pivot.rename(columns={
            "Decision": "Instances",
            "index": "Decision"
        })
        pivot = pivot.sort_values(by=["Instances"],
                                  ascending=False).reset_index()

        else_decision = "return '%s'" % (pivot.iloc[0].Decision)

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(
                file, (functions.formatRule(root + 1), else_decision))
        else:  #parallelism
            leaf_id = str(uuid.uuid1())
            custom_rule_file = "outputs/rules/" + str(leaf_id) + ".txt"

            check_rule = "else: " + else_decision

            sample_rule = "   {\n"
            sample_rule += "      \"current_level\": " + str(root) + ",\n"
            sample_rule += "      \"leaf_id\": \"" + str(leaf_id) + "\",\n"
            sample_rule += "      \"parents\": \"" + parents + "\",\n"
            sample_rule += "      \"rule\": \"" + check_rule + "\"\n"
            sample_rule += "   }"

            functions.createFile(custom_rule_file, "")
            functions.storeRule(custom_rule_file, sample_rule)

    else:  #regression
        else_decision = "return %s" % (subdataset.Decision.mean())

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(
                file, (functions.formatRule(root + 1), else_decision))
        else:
            leaf_id = str(uuid.uuid1())
            custom_rule_file = "outputs/rules/" + str(leaf_id) + ".txt"

            check_rule = "else: " + else_decision

            sample_rule = "   {\n"
            sample_rule += "      \"current_level\": " + str(root) + ",\n"
            sample_rule += "      \"leaf_id\": \"" + str(leaf_id) + "\",\n"
            sample_rule += "      \"parents\": \"" + parents + "\",\n"
            sample_rule += "      \"rule\": \"" + check_rule + "\"\n"
            sample_rule += "   }"

            functions.createFile(custom_rule_file, "")
            functions.storeRule(custom_rule_file, sample_rule)

    #---------------------------

    #create branches in parallel
    if enableParallelism == True:
        """
		#this usage causes trouble for recursive functions
		with Pool(number_of_cpus) as pool:
			pool.starmap(createBranch, input_params)
		"""

        pool = MyPool(num_cores)
        results = pool.starmap(createBranch, input_params)
        pool.close()
        pool.join()

    #---------------------------------------------

    #calculate accuracy metrics
    if root == 1:

        if enableParallelism == True:

            #custom rules are stored in .txt files. merge them all in a json file

            functions.createFile(json_file, "[\n")

            custom_rules = []

            file_index = 0
            for file in os.listdir(os.getcwd() + "/outputs/rules"):
                if file.endswith(".txt"):
                    custom_rules.append(os.getcwd() + "/outputs/rules/" + file)
                    #print(file) #this file stores a custom rule
                    f = open(os.getcwd() + "/outputs/rules/" + file, "r")
                    custom_rule = f.read()

                    if file_index > 0:
                        custom_rule = ", " + custom_rule

                    functions.storeRule(json_file, custom_rule)
                    f.close()
                    file_index = file_index + 1

            functions.storeRule(json_file, "]")

            #-----------------------------------

            #custom rules are already merged in a json file. clear messy custom rules
            #TO-DO: if random forest trees are handled in parallel, this would be a problem. You cannot know the related tree of a rule. You should store a global tree id in a rule.

            for file in custom_rules:
                os.remove(file)

            #-----------------------------------

            reconstructRules(json_file)

            #-----------------------------------

        if config['enableRandomForest'] != True and config[
                'enableGBM'] != True and config['enableAdaboost'] != True:
            #this is a regular decision tree. find accuracy here.

            moduleName = "outputs/rules/rules"
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname,
                                      description)  #rules0
            models.append(myrules)

            num_of_features = df.shape[1] - 1
            instances = df.shape[0]
            classified = 0
            mae = 0
            mse = 0

            #instead of for loops, pandas functions perform well
            raw_df['Prediction'] = raw_df.apply(findPrediction, axis=1)
            if algorithm != 'Regression':
                idx = raw_df[raw_df['Prediction'] == raw_df['Decision']].index

                #raw_df['Classified'] = 0
                #raw_df.loc[idx, 'Classified'] = 1
                #print(raw_df)

                accuracy = 100 * len(idx) / instances
                print("Accuracy: ", accuracy, "% on ", instances, " instances")
            else:
                raw_df['Absolute_Error'] = abs(raw_df['Prediction'] -
                                               raw_df['Decision'])
                raw_df['Absolute_Error_Squared'] = raw_df[
                    'Absolute_Error'] * raw_df['Absolute_Error']

                #print(raw_df)

                mae = raw_df['Absolute_Error'].sum() / instances
                print("MAE: ", mae)

                mse = raw_df['Absolute_Error_Squared'].sum() / instances
                rmse = math.sqrt(mse)
                print("RMSE: ", rmse)

                mean = raw_df['Decision'].mean()
                print("Mean: ", mean)

                if mean > 0:
                    print("MAE / Mean: ", 100 * mae / mean, "%")
                    print("RMSE / Mean: ", 100 * rmse / mean, "%")

    return models
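
MyPool above is presumably a pool whose workers are non-daemonic, since createBranch recurses into buildDecisionTree and daemonic workers are not allowed to spawn children of their own; its actual definition is not shown here. A commonly used sketch of such a pool, assuming Python 3.4+ where Pool accepts a context argument:

import multiprocessing
import multiprocessing.pool

class NoDaemonProcess(multiprocessing.Process):
    # 'daemon' always reads as False, so these workers may spawn child processes
    @property
    def daemon(self):
        return False

    @daemon.setter
    def daemon(self, value):
        pass

class NoDaemonContext(type(multiprocessing.get_context())):
    Process = NoDaemonProcess

class NestablePool(multiprocessing.pool.Pool):
    """A pool whose workers can themselves create pools (e.g. for recursion)."""
    def __init__(self, *args, **kwargs):
        kwargs['context'] = NoDaemonContext()
        super().__init__(*args, **kwargs)

# Usage, mirroring the starmap call above:
# pool = NestablePool(num_cores)
# results = pool.starmap(createBranch, input_params)
# pool.close(); pool.join()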
Beispiel #45
0
def pool_slice(func,
               data_in,
               args=(),
               kwds={},
               num_workers=None,
               thread_abort=None,
               logfile=None,
               num_batches=1,
               progress=0):
    """
	Process data in slices using a pool of workers and return the results.
	
	The individual worker results are returned in the same order as the
	original input data, irrespective of the order in which the workers
	finished (FIFO).
	
	Progress percentage is written to optional logfile using a background
	thread that monitors a queue.
	Note that 'func' is supposed to periodically check thread_abort.event
	which is passed as the first argument to 'func', and put its progress
	percentage into the queue which is passed as the second argument to 'func'.
	
	"""
    from config import getcfg

    if num_workers is None:
        num_workers = cpu_count()
    num_workers = max(min(int(num_workers), len(data_in)), 1)
    max_workers = getcfg("multiprocessing.max_cpus")
    if max_workers:
        num_workers = min(num_workers, max_workers)

    if num_workers == 1 or not num_batches:
        # Splitting the workload into batches only makes sense if there are
        # multiple workers
        num_batches = 1

    chunksize = float(len(data_in)) / (num_workers * num_batches)
    if chunksize < 1:
        num_batches = 1
        chunksize = float(len(data_in)) / num_workers

    if num_workers > 1:
        Pool = NonDaemonicPool
        manager = mp.Manager()
        if thread_abort is not None and not isinstance(thread_abort.event,
                                                       mp.managers.EventProxy):
            # Replace the event with a managed instance that is compatible
            # with pool
            event = thread_abort.event
            thread_abort.event = manager.Event()
            if event.is_set():
                thread_abort.event.set()
        else:
            event = None
        Queue = manager.Queue
    else:
        # Do it all in the main thread of the current instance
        Pool = FakePool
        manager = None
        Queue = FakeQueue

    if thread_abort is not None:
        thread_abort_event = thread_abort.event
    else:
        thread_abort_event = None

    progress_queue = Queue()

    if logfile:

        def progress_logger(num_workers, progress=0.0):
            eof_count = 0
            prevperc = -1
            while progress < 100 * num_workers:
                try:
                    inc = progress_queue.get(True, 0.1)
                    if isinstance(inc, Exception):
                        raise inc
                    progress += inc
                except Empty:
                    continue
                except IOError:
                    break
                except EOFError:
                    eof_count += 1
                    if eof_count == num_workers:
                        break
                perc = round(progress / num_workers)
                if perc > prevperc:
                    logfile.write("\r%i%%" % perc)
                    prevperc = perc

        threading.Thread(target=progress_logger,
                         args=(num_workers * num_batches,
                               progress * num_workers * num_batches),
                         name="ProcessProgressLogger").start()

    pool = Pool(num_workers)
    results = []
    start = 0
    for batch in range(num_batches):
        for i in range(batch * num_workers, (batch + 1) * num_workers):
            end = int(math.ceil(chunksize * (i + 1)))
            results.append(
                pool.apply_async(
                    WorkerFunc(func, batch == num_batches - 1),
                    (data_in[start:end], thread_abort_event, progress_queue) +
                    args, kwds))
            start = end

    # Get results
    exception = None
    data_out = []
    for result in results:
        result = result.get()
        if isinstance(result, Exception):
            exception = result
            continue
        data_out.append(result)

    pool.close()
    pool.join()

    if manager:
        # Need to shutdown manager so it doesn't hold files in use
        if event:
            # Restore original event
            if thread_abort.event.is_set():
                event.set()
            thread_abort.event = event
        manager.shutdown()

    if exception:
        raise exception

    return data_out
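
The slicing arithmetic described in the docstring cuts the input into num_workers * num_batches contiguous slices with ceil-rounded boundaries and collects the results in submission (FIFO) order. A small self-contained sketch of just that arithmetic, with a trivial stand-in worker:

import math
from multiprocessing import Pool

def work(chunk):
    return [x * x for x in chunk]   # stand-in for the real per-slice worker

if __name__ == "__main__":
    data_in = list(range(10))
    num_workers, num_batches = 3, 2
    chunksize = float(len(data_in)) / (num_workers * num_batches)

    pool = Pool(num_workers)
    results, start = [], 0
    for i in range(num_workers * num_batches):
        end = int(math.ceil(chunksize * (i + 1)))
        results.append(pool.apply_async(work, (data_in[start:end],)))
        start = end

    # apply_async handles are collected in submission order, i.e. FIFO with
    # respect to the original data regardless of which worker finishes first
    data_out = [r.get() for r in results]
    pool.close()
    pool.join()
    print(data_out)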
Beispiel #46
0
def build_variable(type: str, ds: DesignSpaceDocument) -> None:
    output = Path("fonts/ttf")

    if type == "latin":
        for instance in ds.instances:
            instance.name = instance.name.replace("Code", "Code Latin")
            instance.familyName = instance.familyName.replace(
                "Code", "Code Latin")
            if instance.styleMapFamilyName:
                instance.styleMapFamilyName = str(
                    instance.styleMapFamilyName).replace("Code", "Code Latin")
        varFont = ufo2ft.compileVariableTTF(ds)
        styleSpace = statmake.classes.Stylespace.from_file(
            "sources/Latin_STAT.plist")
        statmake.lib.apply_stylespace_to_variable_font(styleSpace, varFont, {})
        DSIG_modification(varFont)

        varFont["name"].setName("Mplus Code Latin", 1, 3, 1, 1033)
        varFont["name"].setName("UFDN;MplusCodeLatin-Regular", 3, 3, 1, 1033)
        varFont["name"].setName("Mplus Code Latin Regular", 4, 3, 1, 1033)
        varFont["name"].setName("MplusCodeLatin-Regular", 6, 3, 1, 1033)

        varFont.save(output / "MplusCodeLatin[wdth,wght].ttf")
        autohint(output / "MplusCodeLatin[wdth,wght].ttf")
        prefix = "MplusCodeLatin"

    if type == "one" or type == "two":
        print("[MPLUS " + type + "] Importing Kanji")
        for source in ds.sources:
            if "{" not in source.name:
                step_merge_glyphs_from_ufo(
                    Path("sources/M+1p-" + source.filename[7:-4] + ".ufo"),
                    source.font)
            source.font.features.text = Path(
                "sources/features.fea").read_text()

        print("[MPLUS " + type + "] Importing Kanji replacement rules")
        kanji_ds = DesignSpaceDocument.fromfile(
            "sources/MPLUS-Kanji.designspace")
        for rule in kanji_ds.rules:
            ds.rules.append(rule)

        print("[MPLUS " + type + "] Building")
        varFont = ufo2ft.compileVariableTTF(ds)
        styleSpace = statmake.classes.Stylespace.from_file(
            "sources/MPLUS_STAT.plist")
        statmake.lib.apply_stylespace_to_variable_font(styleSpace, varFont, {})
        DSIG_modification(varFont)

        print("[MPLUS " + type + "] Saving")
        if type == "one":
            varFont.save(output / "Mplus1[wght].ttf")
            autohint(output / "Mplus1[wght].ttf")
            prefix = "Mplus1"
        elif type == "two":
            varFont.save(output / "Mplus2[wght].ttf")
            autohint(output / "Mplus2[wght].ttf")
            prefix = "Mplus2"

    if type == "code":

        for instance in ds.instances:
            instance.name = instance.name.replace("Mplus", "Mplus 1 ")
            instance.familyName = instance.familyName.replace(
                "Mplus", "Mplus 1 ")
            if instance.styleMapFamilyName:
                instance.styleMapFamilyName = instance.styleMapFamilyName.replace(
                    "MplusCode", "Mplus 1 Code")

        print("[MPLUS " + type + "] Importing glyphs")
        for source in ds.sources:
            if "{" not in source.name:
                step_merge_glyphs_from_ufo(
                    Path("sources/Mplus1-" + str(source.name).split(" ")[2] +
                         ".ufo"), source.font, "sources/kana_glyphs.txt")

                step_merge_glyphs_from_ufo(
                    Path("sources/M+1p-" + str(source.name).split(" ")[2] +
                         ".ufo"), source.font)
            source.name = source.name.replace("Mplus", "Mplus 1")
            source.font.features.text = Path("sources/code.fea").read_text()

        print("[MPLUS " + type + "] Importing Kanji replacement rules")
        kanji_ds = DesignSpaceDocument.fromfile(
            "sources/MPLUS-Kanji.designspace")
        for rule in kanji_ds.rules:
            ds.rules.append(rule)

        print("[MPLUS " + type + "] Building")
        varFont = ufo2ft.compileVariableTTF(ds)
        styleSpace = statmake.classes.Stylespace.from_file(
            "sources/MPLUS_STAT.plist")
        statmake.lib.apply_stylespace_to_variable_font(styleSpace, varFont, {})
        DSIG_modification(varFont)

        varFont["name"].setName("Mplus 1 Code", 1, 3, 1, 1033)
        varFont["name"].setName("UFDN;Mplus1Code-Regular", 3, 3, 1, 1033)
        varFont["name"].setName("Mplus 1 Code Regular", 4, 3, 1, 1033)
        varFont["name"].setName("Mplus1Code-Regular", 6, 3, 1, 1033)

        print("[MPLUS " + type + "] Saving")
        varFont.save(output / "Mplus1Code[wght].ttf")
        autohint(output / "Mplus1Code[wght].ttf")
        prefix = "Mplus1Code"

    generator = fontmake.instantiator.Instantiator.from_designspace(ds)

    pool = multiprocessing.pool.Pool(processes=multiprocessing.cpu_count())
    processes = []

    for instance_descriptor in ds.instances:  # GOTTA GO FAST
        processes.append(
            pool.apply_async(
                make_static,
                (instance_descriptor, generator, prefix),
            ))

    pool.close()
    pool.join()
    for process in processes:
        process.get()
    del processes, pool
Beispiel #47
0
    def filter(self, items: Iterable[Any]) -> Iterable[Any]:

        if len(self._filters) == 0:
            return items

        try:
            with Manager() as manager:

                stdout_queue = manager.Queue()  #type: ignore
                stdlog_queue = manager.Queue()  #type: ignore

                stdout_writer, stdout_reader = QueueSink(
                    stdout_queue), QueueSource(stdout_queue)
                stdlog_writer, stdlog_reader = QueueSink(
                    stdlog_queue), QueueSource(stdlog_queue)

                class MyPool(multiprocessing.pool.Pool):

                    _missing_error_definition_error_is_new = True

                    def _join_exited_workers(self):

                        for worker in self._pool:
                            if worker.exitcode == 1000 and MyPool._missing_error_definition_error_is_new:
                                #this is a hack... This only works so long as we just
                                #process one job at a time... This is true in our case.
                                #this is necessary because multiprocessing can get stuck
                                #waiting for failed workers and that is frustrating for users.

                                MyPool._missing_error_definition_error_is_new = False

                                message = (
                                    "Coba attempted to evaluate your benchmark in multiple processes but the pickle module was unable to "
                                    "find all the definitions needed to pass the tasks to the processes. The two most common causes of "
                                    "this error are: 1) a learner or simulation is defined in a Jupyter Notebook cell or 2) a necessary "
                                    "class definition exists inside the `__name__=='__main__'` code block in the main execution script. In "
                                    "either case there are two simple solutions: 1) evalute your benchmark in a single processed with no "
                                    "limit on child tasks or 2) define all you classes in a separate python file that is imported when "
                                    "evaluating.")

                                CobaConfig.Logger.log(message)

                            if worker.exitcode is not None and worker.exitcode != 0:
                                #A worker exited in an uncontrolled manner and was unable to clean its job
                                #up. We therefore mark one of the jobs as "finished" (but failed) to
                                #prevent an infinite wait on a job that is actually no longer running.
                                list(self._cache.values())[0]._set(
                                    None, (False, None))

                        return super()._join_exited_workers()

                with MyPool(self._processes,
                            maxtasksperchild=self._maxtasksperchild) as pool:

                    # handle not picklable (this is handled by done_or_failed)
                    # handle empty list (this is done by checking result.ready())
                    # handle exceptions in process (unhandled exceptions can cause children to hang so we pass them to stderr)
                    # handle ctrl-c without hanging
                    #   > don't call result.get when KeyboardInterrupt has been hit
                    #   > handle EOFError,BrokenPipeError errors with queue since ctrl-c kills manager
                    # handle AttributeErrors. These occur when... (this is handled by shadowing several pool methods)
                    #   > a class that is defined in a Jupyter Notebook cell is pickled
                    #   > a class that is defined inside the __name__=='__main__' block is pickled
                    # handle Benchmark.evaluate not being called inside of __name__=='__main__' (this is handled by a big try/catch)

                    def done_or_failed(results_or_exception=None):
                        #This method is called one time at the completion of map_async
                        #in the case that one of our jobs threw an exception the argument
                        #will contain an exception otherwise it will be the returned results
                        #of all the jobs. This method is executed on a thread in the Main context.

                        if isinstance(results_or_exception, Exception):
                            from coba.config import CobaConfig

                            if "Can't pickle" in str(
                                    results_or_exception) or "Pickling" in str(
                                        results_or_exception):

                                message = (
                                    str(results_or_exception) +
                                    ". Coba attempted to process your Benchmark on multiple processes and "
                                    "the named class was not able to be pickled. This problem can be fixed in one of two ways: 1) "
                                    "evaluate the benchmark in question on a single process with no limit on the tasks per child or 2) "
                                    "modify the named class to be picklable. The easiest way to make the given class picklable is to "
                                    "add `def __reduce__ (self) return (<the class in question>, (<tuple of constructor arguments>))` to "
                                    "the class. For more information see https://docs.python.org/3/library/pickle.html#object.__reduce__."
                                )

                                CobaConfig.Logger.log(message)
                            else:
                                CobaConfig.Logger.log_exception(
                                    results_or_exception)

                        stdout_writer.write([None])
                        stdlog_writer.write([None])

                    log_thread = Thread(target=Pipe.join(
                        stdlog_reader, [], CobaConfig.Logger.sink).run)
                    log_thread.daemon = True
                    log_thread.start()

                    processor = MultiprocessFilter.Processor(
                        self._filters, stdout_writer, stdlog_writer,
                        self._processes)
                    result = pool.map_async(processor.process,
                                            items,
                                            callback=done_or_failed,
                                            error_callback=done_or_failed,
                                            chunksize=1)

                    # When items is empty finished_callback will not be called and we'll get stuck waiting for the poison pill.
                    # When items is empty ready() will be true immediately and this check will place the poison pill into the queues.
                    if result.ready(): done_or_failed()

                    try:
                        for item in stdout_reader.read():
                            yield item
                        pool.close()
                    except (KeyboardInterrupt, Exception):
                        try:
                            pool.terminate()
                        except:
                            pass
                        raise
                    finally:
                        pool.join()
                        log_thread.join()

        except RuntimeError as e:
            #This happens when importing main causes this code to run again
            raise CobaFatal(str(e))
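
The queue handling described in the comments above is a poison-pill pattern: workers push items onto a managed queue, a completion callback pushes a None sentinel, and the parent reads until it sees the sentinel, adding the pill itself when the input is empty and the callback never fires. A reduced sketch of that pattern with illustrative names:

import multiprocessing
from multiprocessing import Manager

def process_item(args):
    item, queue = args
    queue.put(item * 2)          # a worker emits its output through the queue
    return item

if __name__ == "__main__":
    with Manager() as manager:
        out_queue = manager.Queue()

        def done_or_failed(_result=None):
            out_queue.put(None)  # poison pill: tells the reader to stop

        items = [1, 2, 3]
        with multiprocessing.Pool(2) as pool:
            result = pool.map_async(process_item,
                                    [(i, out_queue) for i in items],
                                    callback=done_or_failed,
                                    error_callback=done_or_failed)
            if result.ready():   # empty input: callback never fires, add pill now
                done_or_failed()

            while True:
                item = out_queue.get()
                if item is None:
                    break
                print("received", item)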
Beispiel #48
0
def preprocess_batch(tls_type='webster'):
    # Preprocess is a run with some presets
    # Read script arguments from run.config file.

    data = {}
    if do_preprocess():
        num_processors, num_runs, seeds = config_parser.parse_run_params(print_params=False)

        if len(seeds) != num_runs:
            raise configparser.Error('Number of seeds in run.config `seeds`'
                            ' must match the number of runs (`num_runs`) argument.')

        # Assess total number of processors.
        processors_total = mp.cpu_count()
        print(f'Total number of processors available: {processors_total}\n')

        # Adjust number of processors.
        if num_processors > processors_total:
            num_processors = processors_total
            print(f'Number of processors downgraded to {num_processors}\n')
        # num_processors should be <= num_runs
        seeds = seeds[:num_processors]

        print('Arguments (preprocess):')
        print('-----------------------')
        print(f'Number of runs: {num_runs}')
        print(f'Number of processors: {num_processors}')
        print(f'Number of preprocess trials: {num_processors}\n')


        # Read train.py arguments from train.config file.
        preprocess_config = configparser.ConfigParser()
        preprocess_path = CONFIG_PATH / 'train.config'
        preprocess_config.read(str(preprocess_path))

        # Setup sumo-tls-type.
        preprocess_config.set('train_args', 'tls_type', tls_type)
        preprocess_config.set('train_args', 'experiment_save_agent', str(False))
        preprocess_config.set('mdp_args', 'discretize_state_space', str(False))

        # Get feature & network information
        network = preprocess_config.get('train_args', 'network')
        features = eval(preprocess_config.get('mdp_args', 'features'))

        if eval(preprocess_config.get('mdp_args', 'time_period')) is not None:
            features = ('time',) + features

        # Remove lag from features
        features = tuple(rmlag(f) for f in features)

        # Override train configurations with test parameters.
        test_config = configparser.ConfigParser()
        test_path = CONFIG_PATH / 'test.config'
        test_config.read(test_path.as_posix())

        horizon = int(test_config.get('test_args', 'rollout-time'))
        preprocess_config.set('train_args', 'experiment_time', str(horizon))

        # Write .xml files for test plots creation.
        preprocess_config.set('train_args', 'sumo_emission', str(False))

        timestamp = datetime.now().strftime('%Y%m%d%H%M%S.%f')
        print(f'Experiment timestamp: {timestamp}')

        with tempfile.TemporaryDirectory() as tmp_dir:
            # Create a config file for each train.py
            # with the respective seed. These config
            # files are stored in a temporary directory.
            tmp_path = Path(tmp_dir)
            preprocess_configs = []
            for seed in seeds:

                cfg_path = tmp_path / f'{tls_type}-{seed}.config'
                preprocess_configs.append(cfg_path)

                # Setup train seed.
                preprocess_config.set("train_args", "experiment_seed", str(seed + 1))

                # Write temporary train config file.
                with cfg_path.open('w') as ft:
                    preprocess_config.write(ft)

            # rvs: directories' names holding experiment data
            if num_processors > 1:
                ind = range(num_processors)
                cfgs = preprocess_configs
                packed_args = zip(ind, cfgs)
                pool = NonDaemonicPool(num_processors)
                rvs = pool.map(delay_preprocess, packed_args)
                pool.close()
                pool.join()
            else:
                rvs = []
                for cfg in preprocess_configs:
                    rvs.append(delay_preprocess((0, cfg)))


        data = defaultdict(list)
        for ret in rvs:
            data[(network, features)] += ret['observation_spaces']
        data = digitize2(data)

    return data
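
# A minimal sketch of the fan-out used above: one worker call per
# (index, config-path) pair, dispatched with Pool.map over zipped arguments.
# `NonDaemonicPool` and `delay_preprocess` belong to the example's own code
# base; this standalone version uses a plain multiprocessing.Pool and a
# hypothetical worker for illustration only.
import multiprocessing

def _run_one(packed_args):
    index, cfg_path = packed_args
    # A real worker would launch a training run for cfg_path and return the
    # directory holding its results; here we just echo the inputs.
    return f'run-{index}:{cfg_path}'

if __name__ == '__main__':
    cfg_paths = ['tmp/webster-0.config', 'tmp/webster-1.config']
    num_processors = min(len(cfg_paths), multiprocessing.cpu_count())

    if num_processors > 1:
        with multiprocessing.Pool(num_processors) as pool:
            rvs = pool.map(_run_one, enumerate(cfg_paths))
    else:
        rvs = [_run_one((0, cfg)) for cfg in cfg_paths]

    print(rvs)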
Beispiel #49
0
    def Search(self,
               date,
               table='events',
               coverage=False,
               translation=False,
               output=None,
               queryTime=datetime.datetime.now().strftime('%m-%d-%Y %H:%M:%S'),
               normcols=False):
        """Core searcher method to set parameters for GDELT data searches

        Keyword arguments
        -----------------
        date : str, required
            The string representation of a datetime (single) or date
            range (list of strings) that is (are) the targeted timelines to
            pull GDELT data.

        table : string,{'events','gkg','mentions'}
            Select from the table formats offered by the GDELT service:

                * events (1.0 and 2.0)

                    The biggest difference between 1.0 and 2.0 is the
                    update frequency.  1.0 data is disseminated daily,
                    and the most recent data will be published at 6AM
                    Eastern Standard time of the next day. So, 21 August 2016
                    results would be available 22 August 2016 at 6AM EST.  2.0
                    data updates every 15 minutes of the current day.


                    Version 1.0 runs from January 1, 1979 through March 31,
                    2013 and contains 57 fields for each record. The Daily
                    Updates  collection, which begins April 1, 2013 and runs
                    through present, contains an additional field at the end
                    of each record, for a total of 58 fields for each
                    record. The format is dyadic CAMEO format, capturing two
                    actors and the action performed by Actor1 upon Actor2.

                    Version 2.0 only covers February 19, 2015 onwards,
                    and is stored in an expanded version of the dyadic CAMEO
                    format.  See
                    http://data.gdeltproject.org/documentation/GDELT-Event_
                    Codebook-V2.0.pdf for more information.

                * gkg  (1.0 and 2.0)

                    **Warning** These tables and queries can be extremely
                    large and consume a lot of RAM. Consider running a
                    single day's worth of gkg pulls, store to disk,
                    flush RAM, then proceed to the next day.

                    Table that represents all of the latent dimensions,
                    geography, and network structure of the global news. It
                    applies an array of highly sophisticated natural language
                    processing algorithms to each document to compute a range
                    of codified metadata encoding key latent and contextual
                    dimensions of the document.  Version 2.0 includes Global
                    Content Analysis Measures (GCAM) which reportedly
                    provides 24 emotional measurement packages that assess
                    more than 2,300 emotions and themes from every article
                    in realtime, with multilingual dimensions natively assessing
                    the emotions of 15 languages (Arabic, Basque, Catalan,
                    Chinese, French, Galician, German, Hindi, Indonesian,
                    Korean, Pashto, Portuguese, Russian, Spanish,
                    and Urdu). See documentation about GKG
                    1.0 at http://data.gdeltproject.org/documentation/GDELT-
                    Global_Knowledge_Graph_Codebook.pdf, and GKG 2.0 at http://
                    data.gdeltproject.org/documentation/GDELT-Global_Knowledge_
                    Graph_Codebook-V2.1.pdf.

                * mentions  (2.0 only)

                    The mentions table records every mention of an event
                    over time, along with the timestamp at which the article
                    was published. This allows the progression of an event
                    through the global media to be tracked, identifying
                    outlets that tend to break certain kinds of events the
                    earliest or which may break stories later but are more
                    accurate in their reporting on those events. Combined
                    with the 15 minute update resolution and GCAM, this also
                    allows the emotional reaction and resonance of an event
                    to be assessed as it sweeps through the world’s media.

        coverage : bool, default: False
            When set to 'True' and the GDELT version parameter is set to 2,
            gdeltPyR will pull back every 15 minute interval in the day (
            full results) or, if pulling for the current day, pull all 15
            minute intervals up to the most recent 15 minute interval of the
            current hour.  For example, if the current date is 22 August,
            2016 and the current time is 0828 HRs Eastern, our pull would
            get every 15 minute interval in the day up to 0815 HRs.
            When coverage is set to true and a date range is entered,
            we pull every 15 minute interval for historical days and up to
            the most recent 15 minute interval for the current day, if that
            day is included.
            
        translation : bool, default: False
            Whether or not to pull the translation database available from
            version 2 of GDELT. If translation is True, the translated set
            is downloaded; if False, the English set is downloaded.

        queryTime : datetime object, system generated
            This records the system time when gdeltPyR's query was executed,
            which can be used for logging purposes.

        output : string, {None, 'df', 'gpd', 'shp', 'shapefile', 'json',
                'geojson', 'r', 'geodataframe'}
            Select the output format for the returned GDELT data

            Options
            -------

            json - Javascript Object Notation output; returns list of
            dictionaries in Python or a list of json objects

            r - writes the cross language dataframe to the current directory.
            This uses the Feather library found at https://github.com/wesm/
            feather.  This option returns a pandas dataframe but writes the R
            dataframe to the current working directory. The filename
            includes all the parameters used to launch the query: version,
            coverage, table name, query dates, and query time.

            csv- Outputs a CSV format; all dates and columns are joined
            
            shp- Writes an ESRI shapefile to current directory or path; output
            is filtered to exclude rows with no latitude or longitude
            
            geojson- 
            
            geodataframe- Returns a geodataframe; output is filtered to exclude
            rows with no latitude or longitude.  This output can be manipulated
            for geoprocessing/geospatial operations such as reprojecting the 
            coordinates, creating a thematic map (choropleth map), merging with
            other geospatial objects, etc.  See http://geopandas.org/ for info.

        normcols : bool
            Applies a generic lambda function to normalize GDELT columns 
            for compatibility with SQL or Shapefile outputs.

        Examples
        --------
        >>> import gdelt
        >>> gd = gdelt.gdelt(version=1)
        >>> results = gd.Search(['2016 10 19'],table='events',coverage=True)
        >>> print(len(results))
        244767
        >>> gd = gdelt.gdelt(version=2)
        >>> results = gd.Search(['2016 Oct 10'], table='gkg')
        >>> print(len(results))
        2398
        >>> print(results.V2Persons.ix[2])
        Juanita Broaddrick,1202;Monica Lewinsky,1612;Donald Trump,12;Donald
        Trump,244;Wolf Blitzer,1728;Lucianne Goldberg,3712;Linda Tripp,3692;
        Bill Clinton,47;Bill Clinton,382;Bill Clinton,563;Bill Clinton,657;Bill
         Clinton,730;Bill Clinton,1280;Bill Clinton,2896;Bill Clinton,3259;Bill
          Clinton,4142;Bill Clinton,4176;Bill Clinton,4342;Ken Starr,2352;Ken
          Starr,2621;Howard Stern,626;Howard Stern,4286;Robin Quivers,4622;
          Paula Jones,3187;Paula Jones,3808;Gennifer Flowers,1594;Neil Cavuto,
          3362;Alicia Machado,1700;Hillary Clinton,294;Hillary Clinton,538;
          Hillary Clinton,808;Hillary Clinton,1802;Hillary Clinton,2303;Hillary
           Clinton,4226
        >>> results = gd.Search(['2016 Oct 10'], table='gkg',output='r')

        Notes
        ------
        Read more about GDELT data at http://gdeltproject.org/data.html

        gdeltPyR retrieves Global Database of Events, Language, and Tone
        (GDELT) data (version 1.0 or version 2.0) via parallel HTTP GET
        requests and is an alternative to accessing GDELT
        data via Google BigQuery.

        Performance will vary based on the number of available cores
        (i.e. CPUs), internet connection speed, and available RAM. For
        systems with limited RAM, later iterations of gdeltPyR will include
        an option to store the output directly to disk.

        """

        # check for valid table names; fail early
        valid = ['events', 'gkg', 'vgkg', 'iatv', 'mentions']
        if table not in valid:
            raise ValueError(
                'You entered "{}"; this is not a valid table name.'
                ' Choose from "events", "mentions", or "gkg".'.format(table))

        _date_input_check(date, self.version)
        self.coverage = coverage
        self.date = date
        version = self.version
        baseUrl = self.baseUrl
        self.queryTime = queryTime
        self.table = table
        self.translation = translation
        self.datesString = _gdeltRangeString(_dateRanger(self.date),
                                             version=version,
                                             coverage=self.coverage)

        #################################
        # R dataframe check; fail early
        #################################
        if output == 'r':  # pragma: no cover
            try:
                import feather

            except ImportError:
                raise ImportError(('You need to install `feather` in order '
                                   'to output data as an R dataframe. Keep '
                                   'in mind the function will return a '
                                   'pandas dataframe but write the R '
                                   'dataframe to your current working '
                                   'directory as a `.feather` file.  Install '
                                   'by running\npip install feather\nor if '
                                   'you have Anaconda (preferred)\nconda '
                                   'install feather-format -c conda-forge\nTo '
                                   'learn more about the library visit https:/'
                                   '/github.com/wesm/feather'))

        ##################################
        # Partial Functions
        #################################

        v1RangerCoverage = partial(_gdeltRangeString, version=1, coverage=True)
        v2RangerCoverage = partial(_gdeltRangeString, version=2, coverage=True)
        v1RangerNoCoverage = partial(_gdeltRangeString,
                                     version=1,
                                     coverage=False)
        v2RangerNoCoverage = partial(_gdeltRangeString,
                                     version=2,
                                     coverage=False)
        urlsv1gkg = partial(_urlBuilder, version=1, table='gkg')
        urlsv2mentions = partial(_urlBuilder,
                                 version=2,
                                 table='mentions',
                                 translation=self.translation)
        urlsv2events = partial(_urlBuilder,
                               version=2,
                               table='events',
                               translation=self.translation)
        urlsv1events = partial(_urlBuilder, version=1, table='events')
        urlsv2gkg = partial(_urlBuilder,
                            version=2,
                            table='gkg',
                            translation=self.translation)

        eventWork = partial(_mp_worker, table='events', proxies=self.proxies)
        codeCams = partial(_cameos, codes=codes)

        #####################################
        # GDELT Version 2.0 Headers
        #####################################

        if int(self.version) == 2:
            ###################################
            # Download 2.0 Headers
            ###################################

            if self.table == 'events':
                try:
                    self.events_columns = \
                    pd.read_csv(os.path.join(BASE_DIR, "data", 'events2.csv'))[
                        'name'].values.tolist()

                except:  # pragma: no cover
                    self.events_columns = _events2Heads()

            elif self.table == 'mentions':
                try:
                    self.mentions_columns = \
                        pd.read_csv(
                            os.path.join(BASE_DIR, "data", 'mentions.csv'))[
                            'name'].values.tolist()

                except:  # pragma: no cover
                    self.mentions_columns = _mentionsHeads()
            else:
                try:
                    self.gkg_columns = \
                        pd.read_csv(
                            os.path.join(BASE_DIR, "data", 'gkg2.csv'))[
                            'name'].values.tolist()

                except:  # pragma: no cover
                    self.gkg_columns = _gkgHeads()

        #####################################
        # GDELT Version 1.0 Analytics, Header, Downloads
        #####################################

        if int(self.version) == 1:

            if self.table is "mentions":
                raise ValueError('GDELT 1.0 does not have the "mentions"'
                                 ' table. Specify the "events" or "gkg"'
                                 'table.')
            if self.translation:
                raise ValueError('GDELT 1.0 does not have an option to'
                                 ' return translated table data. Switch to '
                                 'version 2 by reinstantiating the gdelt '
                                 'object with <gd = gdelt.gdelt(version=2)>')
            else:
                pass

            try:
                self.events_columns = \
                    pd.read_csv(os.path.join(BASE_DIR, "data", 'events1.csv'))[
                        'name'].values.tolist()

            except:  # pragma: no cover
                self.events_columns = _events1Heads()

            columns = self.events_columns

            if self.table == 'gkg':
                self.download_list = (urlsv1gkg(
                    v1RangerCoverage(_dateRanger(self.date))))

            elif self.table == 'events' or self.table == '':

                if self.coverage is True:  # pragma: no cover

                    self.download_list = (urlsv1events(
                        v1RangerCoverage(_dateRanger(self.date))))

                else:
                    # print("I'm here at line 125")
                    self.download_list = (urlsv1events(
                        v1RangerNoCoverage(_dateRanger(self.date))))

            else:  # pragma: no cover
                raise Exception('You entered an incorrect table type for '
                                'GDELT 1.0.')
        #####################################
        # GDELT Version 2.0 Analytics and Download
        #####################################
        elif int(self.version) == 2:

            if self.table == 'events' or self.table == '':
                columns = self.events_columns
                if self.coverage is True:  # pragma: no cover

                    self.download_list = (urlsv2events(
                        v2RangerCoverage(_dateRanger(self.date))))
                else:

                    self.download_list = (urlsv2events(
                        v2RangerNoCoverage(_dateRanger(self.date))))

            if self.table == 'gkg':
                columns = self.gkg_columns
                if self.coverage is True:  # pragma: no cover

                    self.download_list = (urlsv2gkg(
                        v2RangerCoverage(_dateRanger(self.date))))
                else:
                    self.download_list = (urlsv2gkg(
                        v2RangerNoCoverage(_dateRanger(self.date))))
                    # print ("2 gkg", urlsv2gkg(self.datesString))

            if self.table == 'mentions':
                columns = self.mentions_columns
                if self.coverage is True:  # pragma: no cover

                    self.download_list = (urlsv2mentions(
                        v2RangerCoverage(_dateRanger(self.date))))

                else:

                    self.download_list = (urlsv2mentions(
                        v2RangerNoCoverage(_dateRanger(self.date))))

        #########################
        # DEBUG Print Section
        #########################

        # if isinstance(self.datesString,str):
        #     if parse(self.datesString) < datetime.datetime.now():
        #         self.datesString = (self.datesString[:8]+"234500")
        # elif isinstance(self.datesString,list):
        #     print("it's a list")
        # elif isinstance(self.datesString,np.ndarray):
        #     print("it's an array")
        # else:
        #     print("don't know what it is")
        # print (self.version,self.download_list,self.date, self.table, self.coverage, self.datesString)
        #
        # print (self.download_list)
        # if self.coverage:
        #     coverage = 'True'
        # else:
        #     coverage = 'False'
        # if isinstance(self.date, list):
        #
        #     formattedDates = ["".join(re.split(' |-|;|:', l)) for l in
        #                       self.date]
        #     path = formattedDates
        #     print("gdeltVersion_" + str(self.version) +
        #           "_coverage_" + coverage + "_" +
        #           "_table_" + self.table + '_queryDates_' +
        #           "_".join(path) +
        #           "_queryTime_" +
        #           datetime.datetime.now().strftime('%m-%d-%YT%H%M%S'))
        # else:
        #     print("gdeltVersion_" + str(self.version) +
        #           "_coverage_" + coverage + "_" +
        #           "_table_" + self.table + '_queryDates_' +
        #           "".join(re.split(' |-|;|:', self.date)) +
        #           "_queryTime_" +
        #           datetime.datetime.now().strftime('%m-%d-%YT%H%M%S'))

        #########################
        # Download section
        #########################
        # print(self.download_list,type(self.download_list))

        # from gdelt.extractors import normalpull
        # e=ProcessPoolExecutor()
        # if isinstance(self.download_list,list) and len(self.download_list)==1:
        #     from gdelt.extractors import normalpull
        #
        #     results=normalpull(self.download_list[0],table=self.table)
        # elif isinstance(self.download_list,list):
        #     print(table)
        #     multilist = list(e.map(normalpull,self.download_list))
        #     results = pd.concat(multilist)
        # print(results.head())

        if isinstance(self.datesString, str):
            if self.table == 'events':

                results = eventWork(self.download_list)
            else:
                # if self.table =='gkg':
                #     results = eventWork(self.download_list)
                #
                # else:
                results = _mp_worker(self.download_list, proxies=self.proxies)

        else:

            if self.table == 'events':

                pool = Pool(processes=cpu_count())
                downloaded_dfs = list(
                    pool.imap_unordered(eventWork, self.download_list))
            else:

                pool = NoDaemonProcessPool(processes=cpu_count())
                downloaded_dfs = list(
                    pool.imap_unordered(
                        _mp_worker,
                        self.download_list,
                    ))
            pool.close()
            pool.terminate()
            pool.join()
            # print(downloaded_dfs)
            results = pd.concat(downloaded_dfs)
            del downloaded_dfs
            results.reset_index(drop=True, inplace=True)

        if self.table == 'gkg' and self.version == 1:
            results.columns = results.ix[0].values.tolist()
            results.drop([0], inplace=True)
            columns = results.columns

        # if the query returned nothing, raise an error before touching columns
        if results is None or len(results) == 0:  # pragma: no cover
            raise ValueError("This GDELT query returned no data. Check "
                             "query parameters and retry.")

        # otherwise assign the header row
        if len(results.columns) == 57:  # pragma: no cover
            results.columns = columns[:-1]
        else:
            results.columns = columns

        # Add column of human readable codes; need updated CAMEO
        if self.table == 'events':
            cameoDescripts = results.EventCode.apply(codeCams)

            results.insert(27,
                           'CAMEOCodeDescription',
                           value=cameoDescripts.values)

        ###############################################
        # Setting the output options
        ###############################################

        # dataframe output
        if output == 'df':
            self.final = results

        # json output
        elif output == 'json':
            self.final = results.to_json(orient='records')

        # csv output
        elif output == 'csv':
            self.final = results.to_csv(encoding='utf-8')

        # geopandas dataframe output
        elif output == 'gpd' or output == 'geodataframe' or output == 'geoframe':
            self.final = _geofilter(results)
            self.final = self.final[self.final.geometry.notnull()]

        # r dataframe output
        elif output == 'r':  # pragma: no cover
            if self.coverage:
                coverage = 'True'
            else:
                coverage = 'False'
            if isinstance(self.date, list):

                formattedDates = [
                    "".join(re.split(' |-|;|:', l)) for l in self.date
                ]
                path = formattedDates
                outPath = (
                    "gdeltVersion_" + str(self.version) + "_coverage_" +
                    coverage + "_" + "_table_" + self.table + '_queryDates_' +
                    "_".join(path) + "_queryTime_" +
                    datetime.datetime.now().strftime('%m-%d-%YT%H%M%S') +
                    ".feather")
            else:
                outPath = (
                    "gdeltVersion_" + str(self.version) + "_coverage_" +
                    coverage + "_" + "_table_" + self.table + '_queryDates_' +
                    "".join(re.split(' |-|;|:', self.date)) + "_queryTime_" +
                    datetime.datetime.now().strftime('%m-%d-%YT%H%M%S') +
                    ".feather")

            if normcols:
                results.columns = list(
                    map(lambda x: (x.replace('_', "")).lower(),
                        results.columns))

            feather.api.write_dataframe(results, outPath)
            return results

        else:
            self.final = results

        #########################
        # Return the result
        #########################

        # normalized columns
        if normcols:
            self.final.columns = list(
                map(lambda x: (x.replace('_', "")).lower(),
                    self.final.columns))

        return self.final
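
# A standalone sketch of the download pattern used above: fan the per-interval
# downloads out with imap_unordered, then concatenate the partial dataframes.
# `_fetch` stands in for gdeltPyR's _mp_worker, and the URLs are illustrative
# placeholders only.
import multiprocessing
import pandas as pd

def _fetch(url):
    # A real worker would HTTP GET `url` and parse it into a dataframe.
    return pd.DataFrame({'url': [url], 'rows': [len(url)]})

if __name__ == '__main__':
    urls = [f'http://data.gdeltproject.org/part-{i}.zip' for i in range(4)]
    with multiprocessing.Pool(processes=min(4, multiprocessing.cpu_count())) as pool:
        frames = list(pool.imap_unordered(_fetch, urls))
    results = pd.concat(frames).reset_index(drop=True)
    print(results)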
Beispiel #50
0
        pool = multiprocessing.pool.Pool(processes=multiprocessing.cpu_count())
        processes = []

        if args.decol:
            processes.append(pool.apply_async(execute, ("decol", sources)))

        if args.haruno:
            processes.append(pool.apply_async(execute, ("haruno", sources)))

        if args.opti:
            processes.append(pool.apply_async(execute, ("opti", sources)))

        if args.tokumin:
            processes.append(pool.apply_async(execute, ("tokumin", sources)))

        pool.close()
        pool.join()
        for process in processes:
            process.get()
        del processes, pool

    elif args.shared:
        if os.path.isfile(sources / "Kaisei-Shared.glyphs"):
            main(("glyphs2ufo", str(sources / "Kaisei-Shared.glyphs"), "-m",
                  str(sources / "ufo_shared")))
        else:
            print(
                "Cannot locate the 'shared' Glyphs file. Please confirm the file is unzipped."
            )
    else:
        print("No fonts selected for export")
Beispiel #51
0
def _run_next_virtual_nodes(graph, node, globals_, locals_, flags, pool,
                            result):

    operator = graph.node[node].get('OPERATOR', None)

    return_value = []

    not_safe_to_iter = False

    is_head_result = True

    head_result = None

    # "Hello, world" or {...}

    if isinstance(result, (basestring, dict)) or not __isiter(result):

        not_safe_to_iter = True

    # [[1]]

    if isinstance(result, list) and len(result) == 1 and isinstance(
            result[0], list):

        result = result[0]

        not_safe_to_iter = True

    # More nodes ahead?

    if operator:

        if not_safe_to_iter:

            logging.debug('not_safe_to_iter is True for %s' % result)

            head_result = result

            tmp_globals = copy.copy(globals_)

            tmp_locals = copy.copy(locals_)

            tmp_globals['_'] = tmp_locals['_'] = head_result

            return_value = __resolve_and_merge_results(
                _run(graph, node, tmp_globals, tmp_locals, {}, None, True))

        else:

            # Originally this used result[0] and result[1:], but xrange() is not slice-able, so it was changed to a `for` loop that buffers the first result

            for res_value in result:

                logging.debug('Now at %s from %s' % (res_value, result))

                if is_head_result:

                    logging.debug('is_head_result is True for %s' % res_value)

                    is_head_result = False

                    head_result = res_value

                    tmp_globals = copy.copy(globals_)

                    tmp_locals = copy.copy(locals_)

                    tmp_globals['_'] = tmp_locals['_'] = head_result

                    return_value.insert(
                        0,
                        _run(graph, node, tmp_globals, tmp_locals, {}, None,
                             True))

                    continue

                tmp_globals = copy.copy(globals_)

                tmp_locals = copy.copy(locals_)

                tmp_globals['_'] = tmp_locals['_'] = res_value

                # Synchronous

                if operator == '|':

                    return_value.append(
                        pool.apply(_run,
                                   args=(graph, node, tmp_globals, tmp_locals,
                                         {}, None, True)))

                # Asynchronous

                if operator == '->':

                    return_value.append(
                        pool.apply_async(_run,
                                         args=(graph, node, tmp_globals,
                                               tmp_locals, {}, None, True)))

            pool.close()

            pool.join()

            pool.terminate()

            logging.debug('return_value = %s' % return_value)

            return_value = __resolve_and_merge_results(return_value)

    # Loopback

    else:

        # AS IS

        if not_safe_to_iter:

            return_value = [result]

        # Iterate for all possible *return values*

        else:

            for res_value in result:

                return_value.append(res_value)

            # Unbox

            if len(return_value) == 1:

                return_value = return_value[0]

    return return_value
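
# A compact sketch of the two dispatch modes used above: pool.apply blocks
# until the worker returns (the synchronous '|' operator), while
# pool.apply_async returns immediately and the result is collected later
# (the asynchronous '->' operator). The worker below is illustrative.
import multiprocessing

def _double(x):
    return 2 * x

if __name__ == '__main__':
    with multiprocessing.Pool(processes=2) as pool:
        # Synchronous: each call waits for its result before the next starts.
        sync_results = [pool.apply(_double, (x,)) for x in range(3)]

        # Asynchronous: all calls are queued first, results gathered afterwards.
        handles = [pool.apply_async(_double, (x,)) for x in range(3)]
        async_results = [h.get() for h in handles]

    print(sync_results, async_results)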
def findDEMFeature(original_dem, index):
    global featureList,maxArea,xy,areaList,indexList,maskBB,maskHeight,maskLabel,maskStd,\
     neighbours,regionbb,mask,regionval,smallerThan,kernel
    height, width = original_dem.shape
    region = regionprops(index, original_dem, cache=True)
    number_regions = len(region)
    for i in range(0, number_regions):
        if region[i].area > 10000:
            areaList.append(region[i].area)
            indexList.append(i)
            maskBB.append(region[i].bbox)
            maskLabel.append(region[i].label)
            maskHeight.append(region[i].mean_intensity)
            xy = region[i].coords
            std = np.std(original_dem[xy[:, 0], xy[:, 1]])
            maskStd.append(std)

    areaList = np.array(areaList)
    indexList = np.array(indexList)
    maskBB = np.array(maskBB)
    maskHeight = np.array(maskHeight)
    maskLabel = np.array(maskLabel)
    maskStd = np.array(maskStd)
    order = np.argsort(-areaList)  # minus for descending
    areaList = areaList[order]
    indexList = indexList[order]
    maskBB = maskBB[order]
    maskHeight = maskHeight[order]
    maskLabel = maskLabel[order]
    maskStd = maskStd[order]

    for regionIndex in range(0, int(len(areaList) / 10)):
        minr, minc, maxr, maxc = maskBB[regionIndex]
        extraMargin = 20
        if minr - extraMargin < 0:
            minr = 0
        else:
            minr = minr - extraMargin
        if minc - extraMargin < 0:
            minc = 0
        else:
            minc = minc - extraMargin
        if maxr + extraMargin > height:
            maxr = height
        else:
            maxr = maxr + extraMargin
        if maxc + extraMargin > width:
            maxc = width
        else:
            maxc = maxc + extraMargin
        regionbb = index[minr:maxr, minc:maxc]
        mask = (regionbb == maskLabel[regionIndex]).astype(np.uint8) * 255
        contours = cv2.findContours(mask, cv2.RETR_TREE,
                                    cv2.CHAIN_APPROX_SIMPLE)[1]

        holeData = []
        if len(contours) - 1 > 0:
            for j in range(0, len(contours) - 1):
                cnt = contours[j + 1]
                pos = cnt[0]
                area = cv2.contourArea(cnt)
                if area > 1000:
                    holeData.append(cv2.contourArea(contours[j + 1]))
            if len(holeData) > 0:
                number_holes = len(holeData)
                holeData = np.sort(holeData)
                avgHole = np.mean(holeData, dtype=np.int)
            else:
                number_holes = avgHole = largestHole = 0

        else:
            number_holes = avgHole = largestHole = 0

        cnt = contours[0]
        hull = cv2.convexHull(cnt, returnPoints=False)
        defects = cv2.convexityDefects(cnt, hull)
        defectData = []
        if defects is not None:
            total_number_defects = len(defects)
            for i in range(defects.shape[0]):
                d = defects[i, 0][3]
                if d > 100000:
                    defectData.append(d)
            if len(defectData) > 0:
                number_defects = len(defectData)
                defectData = np.sort(defectData)
                avgDefect = np.mean(defectData, dtype=np.int)
            else:
                number_defects = avgDefect = 0
        else:
            number_defects = avgDefect = 0
            total_number_defects = 0

        mask2 = cv2.dilate(mask, kernel, iterations=1) > 0
        regionbb = np.multiply(mask2, regionbb)
        neighbours = np.unique(regionbb)
        if neighbours[0] == 0:
            neighbours = neighbours[1:-1]
        removePos = np.where(neighbours == maskLabel[regionIndex])
        neighbours = np.delete(neighbours, removePos)

        neighbours = np.intersect1d(
            neighbours, maskLabel)  #to take only large area adjacent segments
        smallerThan = 0

        if maskStd[regionIndex] > 2:
            regionval = original_dem[minr:maxr, minc:maxc]
            pool = ThreadPool(int(cpu_count()))
            pool.map(findStdSmaller, range(0, len(neighbours)))
            pool.close()
            pool.join()
        else:
            for i in range(0, len(indexList)):
                for j in range(0, len(neighbours)):
                    if maskLabel[i] == neighbours[j]:
                        if maskHeight[i] > maskHeight[regionIndex] + 2:
                            smallerThan = smallerThan + 1

        featureList.append([maskLabel[regionIndex],number_holes,avgHole,\
         number_defects,total_number_defects,smallerThan,int(np.ceil(maskStd[regionIndex]))])

    return np.array(featureList), region
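
# A minimal sketch of why the function above uses a ThreadPool rather than a
# process Pool: threads share the module-level numpy arrays directly, so a
# worker like `findStdSmaller` can read them without pickling. The worker and
# data below are illustrative stand-ins; list.append is safe here because
# CPython's GIL makes it atomic.
from multiprocessing.pool import ThreadPool
from multiprocessing import cpu_count
import numpy as np

heights = np.array([10.0, 12.5, 9.8, 11.1])   # shared, read-only data
reference = 10.5
smaller = []

def _compare(i):
    if heights[i] < reference:
        smaller.append(i)

if __name__ == '__main__':
    pool = ThreadPool(int(cpu_count()))
    pool.map(_compare, range(len(heights)))
    pool.close()
    pool.join()
    print(sorted(smaller))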
Beispiel #53
0
def eval(source, globals_={}, locals_={}):
    """Evaluate Pythonect code in the context of globals and locals.

    Args:
        source: A string representing a Pythonect code or a networkx.DiGraph() as
            returned by parse()
        globals_: A dictionary of globals.
        locals_: Any mapping of locals.

    Returns:
        The return value is the result of the evaluated code.

    Raises:
        SyntaxError: An error occurred parsing the code.
    """

    return_value = None

    # Meaningful program?

    if source != "pass":

        logging.info('Program is meaningful')

        return_value = []

        return_values = []

        globals_values = []

        locals_values = []

        tasks = []

        reduces = {}

        logging.debug('Evaluating %s with globals_ = %s and locals_ %s' %
                      (source, globals_, locals_))

        if not isinstance(source, networkx.DiGraph):

            logging.info('Parsing program...')

            graph = parse(source)

        else:

            logging.info('Program is already parsed! Using source AS IS')

            graph = source

        root_nodes = sorted([
            node for node, degree in graph.in_degree().items() if degree == 0
        ])

        if not root_nodes:

            cycles = list(networkx.simple_cycles(graph))

            if cycles:

                logging.info(
                    'Found cycles: %s in graph, using nodes() 1st node (i.e. %s) as root node'
                    % (cycles, graph.nodes()[0]))

                root_nodes = [graph.nodes()[0]]

        logging.info('There are %d root node(s)' % len(root_nodes))

        logging.debug('Root node(s) are: %s' % root_nodes)

        # Extend Python's __builtin__ with Pythonect's `lang`

        start_globals_ = __extend_builtins(globals_)

        logging.debug('Initial globals_:\n%s' % pprint.pformat(start_globals_))

        # Default input

        start_globals_['_'] = start_globals_.get('_', locals_.get('_', None))

        logging.info('_ equal %s', start_globals_['_'])

        # Execute Pythonect program

        pool = __create_pool(globals_, locals_)

        # N-1

        for root_node in root_nodes[1:]:

            if globals_.get('__IN_EVAL__',
                            None) is None and not _is_referencing_underscore(
                                graph, root_node):

                # Reset '_'

                globals_['_'] = locals_['_'] = None

            if globals_.get('__IN_EVAL__', None) is None:

                globals_['__IN_EVAL__'] = True

            temp_globals_ = copy.copy(globals_)

            temp_locals_ = copy.copy(locals_)

            task_result = pool.apply_async(_run,
                                           args=(graph, root_node,
                                                 temp_globals_, temp_locals_,
                                                 {}, None, False))

            tasks.append((task_result, temp_locals_, temp_globals_))

        # 1

        if globals_.get('__IN_EVAL__',
                        None) is None and not _is_referencing_underscore(
                            graph, root_nodes[0]):

            # Reset '_'

            globals_['_'] = locals_['_'] = None

        if globals_.get('__IN_EVAL__', None) is None:

            globals_['__IN_EVAL__'] = True

        result = _run(graph, root_nodes[0], globals_, locals_, {}, None, False)

        # 1

        for expr_return_value in result:

            globals_values.append(globals_)

            locals_values.append(locals_)

            return_values.append([expr_return_value])

        # N-1

        for (task_result, task_locals_, task_globals_) in tasks:

            return_values.append(task_result.get())

            locals_values.append(task_locals_)

            globals_values.append(task_globals_)

        # Reduce + _PythonectResult Grouping

        for item in return_values:

            # Is there _PythonectResult in item list?

            for sub_item in item:

                if isinstance(sub_item, _PythonectResult):

                    # 1st Time?

                    if sub_item.values['node'] not in reduces:

                        reduces[sub_item.values['node']] = []

                        # Add Place holder to mark the position in the return value list

                        return_value.append(
                            _PythonectLazyRunner(sub_item.values['node']))

                    reduces[sub_item.values['node']] = reduces[
                        sub_item.values['node']] + [sub_item.values]

                else:

                    return_value.append(sub_item)

        # Any _PythonectLazyRunner's?

        if reduces:

            for return_item_idx in xrange(0, len(return_value)):

                if isinstance(return_value[return_item_idx],
                              _PythonectLazyRunner):

                    # Swap list[X] with list[X.go(reduces)]

                    return_value[return_item_idx] = pool.apply_async(
                        return_value[return_item_idx].go,
                        args=(graph, reduces))

            return_value = __resolve_and_merge_results(return_value)

        # [...] ?

        if return_value:

            # Single return value? (e.g. [1])

            if len(return_value) == 1:

                return_value = return_value[0]

            # Update globals_ and locals_

        # globals_, locals_ = __merge_all_globals_and_locals(globals_, locals_, globals_values, {}, locals_values, {})

        # Set `return value` as `_`

        globals_['_'] = locals_['_'] = return_value

        if globals_.get('__IN_EVAL__', None) is not None:

            del globals_['__IN_EVAL__']

        pool.close()

        pool.join()

        pool.terminate()

    return return_value
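
# A small sketch of the bookkeeping eval() does around apply_async: each
# dispatched branch receives its own shallow copy of the mutable context so
# concurrent branches do not overwrite one another's '_' binding. A ThreadPool
# is used here so the copies genuinely matter; all names are illustrative.
import copy
from multiprocessing.pool import ThreadPool

def _run_branch(context, value):
    context['_'] = value * value          # would race on a shared dict
    return context['_']

if __name__ == '__main__':
    base_context = {'_': None}
    pool = ThreadPool(processes=2)
    tasks = []
    for value in range(4):
        task_context = copy.copy(base_context)    # per-branch copy
        tasks.append(pool.apply_async(_run_branch, (task_context, value)))
    results = [t.get() for t in tasks]
    pool.close()
    pool.join()
    print(results)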
Beispiel #54
0
def processFile(filename):

    ex = detectExtension(filename)
    print("\nStarted processing ", filename[:-len(ex)], " ...")

    workImage = Image.open(("images/" + filename))
    width, height = workImage.size
    preview = Image.new('RGB', (width, height), color='red')

    if (dithering):
        print("Started dithering for " + filename)
        for i in range(width):
            for j in range(height):
                oldPixel = getPixel([i, j], workImage)
                newPixel = getClosestColor(oldPixel, colors)

                setPixel([i, j], workImage, getClosestColor(newPixel, colors))
                quant_error = np.array(oldPixel) - np.array(newPixel)

                if (i < width - 1):
                    setPixel([i + 1, j], workImage,
                             (np.array(getPixel([i + 1, j], workImage)) +
                              (quant_error * (7 / 16))).astype(int))
                if (i > 0 and j < height - 1):
                    setPixel([i - 1, j + 1], workImage,
                             (np.array(getPixel([i - 1, j + 1], workImage)) +
                              (quant_error * (3 / 16))).astype(int))
                if (j < height - 1):
                    setPixel([i, j + 1], workImage,
                             (np.array(getPixel([i, j + 1], workImage)) +
                              (quant_error * (5 / 16))).astype(int))
                if (i < width - 1 and j < height - 1):
                    setPixel([i + 1, j + 1], workImage,
                             (np.array(getPixel([i + 1, j + 1], workImage)) +
                              (quant_error * (1 / 16))).astype(int))
        print("Finished dithering for " + filename)

    inData = []
    seg = width // 6
    for n in range(6):
        inData.append([n, [seg * (n), seg * (n + 1)], workImage, height])
        if (seg * (n + 1) < width and n == 5):
            inData.append([n, [seg * (n + 1), width], workImage, height])

    print("Started assembling function for " + filename)
    if __name__ == '__main__':
        pool = nPool(processThreads)
        output = pool.map(generateCommandItems, inData)
        pool.close()
        pool.join()

    output.sort()

    if (isGif):
        command = '{"function":"set_nbt","tag":"{\\"Items\\":['
        items = ""
        for e in output:
            items += e[1]

        command += items + '],gbundle:\\"start\\"}"}'

        file1 = open(
            "item_modifiers/" +
            (''.join([char for char in filename[:-len(ex)] if char != '\\'
                      ])).lower() + ".json", "w")
        file1.write(command)
        file1.close()

    else:
        items = ""
        for e in output:
            items += e[1]
        command = "give @p bundle{Items:[" + (''.join(
            [char for char in items[:-1] if char != '\\'])) + "]}"
        file1 = open(
            "functions/" +
            (''.join([char for char in filename[:-len(ex)] if char != '\\'
                      ])).lower() + ".mcfunction", "w")
        file1.write(command)
        file1.close()

    print(filename + " command has been generated and saved! Size: " +
          str(len(command)))
    return 1
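
# A brief sketch of the `if __name__ == '__main__'` guard that the function
# above checks before building its pool: with the spawn start method (Windows,
# recent macOS), child processes re-import the main module, so unguarded pool
# creation would recurse. The guard normally lives at module level; the worker
# and data below are illustrative.
import multiprocessing

def _work(chunk):
    return sum(chunk)

def build_commands(chunks, processes=4):
    with multiprocessing.Pool(processes) as pool:
        return pool.map(_work, chunks)

if __name__ == '__main__':
    # Only the parent process reaches this block when children re-import us.
    print(build_commands([[1, 2], [3, 4], [5, 6]]))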
    def __init__(self,
                 directory,
                 image_data_generator,
                 triplet_path,
                 target_size=(256, 256),
                 color_mode='rgb',
                 classes=None,
                 class_mode='categorical',
                 batch_size=32,
                 shuffle=True,
                 seed=None,
                 data_format=None,
                 save_to_dir=None,
                 save_prefix='',
                 save_format='png',
                 follow_links=False):

        if data_format is None:
            data_format = K.image_data_format()
        self.directory = directory
        self.image_data_generator = image_data_generator
        self.target_size = tuple(target_size)
        if color_mode not in {'rgb', 'grayscale'}:
            raise ValueError('Invalid color mode:', color_mode,
                             '; expected "rgb" or "grayscale".')
        self.color_mode = color_mode
        self.data_format = data_format
        if self.color_mode == 'rgb':
            if self.data_format == 'channels_last':
                self.image_shape = self.target_size + (3, )
            else:
                self.image_shape = (3, ) + self.target_size
        else:
            if self.data_format == 'channels_last':
                self.image_shape = self.target_size + (1, )
            else:
                self.image_shape = (1, ) + self.target_size
        self.classes = classes
        if class_mode not in {
                'categorical', 'binary', 'sparse', 'input', None
        }:
            raise ValueError(
                'Invalid class_mode:', class_mode,
                '; expected one of "categorical", '
                '"binary", "sparse", "input"'
                ' or None.')
        self.class_mode = class_mode
        self.save_to_dir = save_to_dir
        self.save_prefix = save_prefix
        self.save_format = save_format

        white_list_formats = {'png', 'jpg', 'jpeg', 'bmp', 'ppm'}

        # first, count the number of samples and classes
        self.samples = 0

        if not classes:
            classes = []
            for subdir in sorted(os.listdir(directory)):
                if os.path.isdir(os.path.join(directory, subdir)):
                    classes.append(subdir)

        self.num_class = len(classes)
        self.class_indices = dict(zip(classes, range(len(classes))))

        pool = multiprocessing.pool.ThreadPool()
        function_partial = partial(_count_valid_files_in_directory,
                                   white_list_formats=white_list_formats,
                                   follow_links=follow_links)
        self.samples = sum(
            pool.map(function_partial,
                     (os.path.join(directory, subdir) for subdir in classes)))

        print('Found %d images belonging to %d classes.' %
              (self.samples, self.num_class))

        # second, build an index of the images in the different class subfolders
        results = []

        self.filenames = []
        self.classes = np.zeros((batch_size, ), dtype='int32')
        i = 0
        for dirpath in (os.path.join(directory, subdir) for subdir in classes):
            #result = _list_valid_filenames_in_directory(dirpath, white_list_formats, self.class_indices, follow_links, triplet_path)
            #results.append(result)
            results.append(
                pool.apply_async(
                    _list_valid_filenames_in_directory,
                    (dirpath, white_list_formats, self.class_indices,
                     follow_links, triplet_path)))
        for res in results:
            classes, filenames = res.get()
            #self.classes = np.zeros((len(filenames),), dtype='int32')
            #self.classes[i:i + len(classes)] = classes
            self.filenames += filenames
            i += len(classes)
        pool.close()
        pool.join()
        super(DirectoryIterator, self).__init__(self.samples, batch_size,
                                                shuffle, seed, triplet_path)
    def generate_prediction(self, model, verbose=False):
        """Implementation of sciunit.Test.generate_prediction."""

        efel.reset()

        self.observation = collections.OrderedDict(sorted(self.observation.items()))

        global model_name_soma
        model_name_soma = model.name

        pool = multiprocessing.Pool(self.npool, maxtasksperchild=1)

        stimuli_list=self.create_stimuli_list()

        run_stim_ = functools.partial(self.run_stim, model)
        traces_results = pool.map(run_stim_, stimuli_list, chunksize=1)
        #traces_results = traces_result.get()


        pool.terminate()
        pool.join()
        del pool

        pool2 = multiprocessing.Pool(self.npool, maxtasksperchild=1)

        features_names, features_list = self.create_features_list(self.observation)

        analyse_traces_ = functools.partial(self.analyse_traces, stimuli_list, traces_results)
        feature_results = pool2.map(analyse_traces_, features_list, chunksize=1)
        #feature_results = feature_result.get()

        pool2.terminate()
        pool2.join()
        del pool2

        feature_results_dict={}
        for i in range(0, len(feature_results)):
            feature_results_dict.update(feature_results[i])  # concatenate dictionaries

        if self.specify_data_set != '':
            specify_data_set = '_' + self.specify_data_set
        else:
            specify_data_set = self.specify_data_set
        if self.base_directory:
            self.path_results = self.base_directory + 'results/' + 'somaticfeat' + specify_data_set + '/' + model.name + '/'
        else:
            self.path_results = model.base_directory + 'results/' + 'somaticfeat' + specify_data_set + '/'

        try:
            if not os.path.exists(self.path_results):
                os.makedirs(self.path_results)
        except OSError as e:
            if e.errno != 17:
                raise
            pass

        file_name=self.path_results+'soma_features.p'

        SomaFeaturesDict={}
        SomaFeaturesDict['traces_results']=traces_results
        SomaFeaturesDict['features_names']=features_names
        SomaFeaturesDict['feature_results_dict']=feature_results_dict
        SomaFeaturesDict['observation']=self.observation
        if self.save_all:
            pickle.dump(SomaFeaturesDict, gzip.GzipFile(file_name, "wb"))

        plt.close('all') #needed to avoid overlapping of saved images when the test is run on multiple models in a for loop

        self.create_figs(model, traces_results, features_names, feature_results_dict, self.observation)

        #prediction = feature_results_dict

        soma_features={}
        needed_keys = { 'feature mean', 'feature sd'}
        for i in range(len(SomaFeaturesDict['features_names'])):
            feature_name = SomaFeaturesDict['features_names'][i]
            soma_features[feature_name] = { key:value for key,value in list(feature_results_dict[feature_name].items()) if key in needed_keys }

        file_name_json = self.path_results + 'somatic_model_features.json'

        json.dump(soma_features, open(file_name_json, "w"), indent=4)

        prediction=soma_features

        efel.reset()

        return prediction
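
# A minimal sketch of the pool settings used above: maxtasksperchild=1 gives
# every task a fresh worker process (useful when the worker leaks memory or
# holds simulator state), and chunksize=1 keeps long-running tasks from being
# batched onto one worker. The workload below is a trivial stand-in.
import functools
import multiprocessing

def _simulate(scale, stimulus):
    return scale * stimulus

if __name__ == '__main__':
    stimuli = [0.1, 0.2, 0.4, 0.8]
    pool = multiprocessing.Pool(processes=2, maxtasksperchild=1)
    run_stim = functools.partial(_simulate, 10.0)   # bind the shared argument
    traces = pool.map(run_stim, stimuli, chunksize=1)
    pool.terminate()
    pool.join()
    del pool
    print(traces)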
Beispiel #57
0
    def compute_descriptor_async(self,
                                 data_iter,
                                 descr_factory,
                                 overwrite=False,
                                 procs=None,
                                 **kwds):
        """
        Asynchronously compute feature data for multiple data items.

        :param data_iter: Iterable of data elements to compute features for.
            These must have UIDs assigned for feature association in return
            value.
        :type data_iter: collections.Iterable[smqtk.representation.DataElement]

        :param descr_factory: Factory instance to produce the wrapping
            descriptor element instances.
        :type descr_factory: smqtk.representation.DescriptorElementFactory

        :param overwrite: Whether or not to force re-computation of descriptor
            vectors for the given data even when there exist precomputed
            vectors in the DescriptorElements generated by the provided
            factory. This will overwrite the persistently stored vectors if
            the provided factory produces a DescriptorElement implementation
            with such storage.
        :type overwrite: bool

        :param procs: Optional specification of how many processors to use
            when pooling sub-tasks. If None, we attempt to use all available
            cores.
        :type procs: int | None

        :param pool_type: multiprocessing pool type to use. If not provided,
            the multiprocessing.pool.ThreadPool type is used by default.
        :type pool_type: type | None

        :return: Mapping of input DataElement instances to the computed
            descriptor element.
            DescriptorElement UUID's are congruent with the UUID of the data
            element it is the descriptor of.
        :rtype: dict[smqtk.representation.DataElement,
                     smqtk.representation.DescriptorElement]

        """
        self._log.info("Async compute features")

        # Mapping of DataElement to async processing result
        #: :type: dict[smqtk.representation.DataElement, multiprocessing.pool.ApplyResult]
        ar_map = {}
        # Mapping of DataElement to the DescriptorElement for it.
        #: :type: dict[smqtk.representation.DataElement, smqtk.representation.DescriptorElement]
        de_map = {}

        # Queue up descriptor generation for descriptor elements that need it
        procs = procs and int(procs)
        pool_t = kwds.get("pool_type", multiprocessing.pool.ThreadPool)
        pool = pool_t(processes=procs)
        with SimpleTimer("Queuing descriptor computation...", self._log.debug):
            for d in data_iter:
                de_map[d] = descr_factory.new_descriptor(self.name, d.uuid())
                if overwrite or not de_map[d].has_vector():
                    ar_map[d] = \
                        pool.apply_async(_async_feature_generator_helper,
                                         args=(self, d))
        pool.close()

        failures = False
        # noinspection PyPep8Naming
        perc_T = 0.0
        perc_inc = 0.1
        with SimpleTimer("Collecting async results...", self._log.debug):
            for i, (d, ar) in enumerate(ar_map.iteritems()):
                descriptor = ar.get()
                if descriptor is None:
                    failures = True
                    continue
                else:
                    de_map[d].set_vector(descriptor)

                perc = float(i + 1) / len(ar_map)
                if perc >= perc_T:
                    self._log.debug("Progress: [%d/%d] %3.3f%%", i + 1,
                                    len(ar_map),
                                    float(i + 1) / (len(ar_map)) * 100)
                    perc_T += perc_inc
        pool.join()

        # Check for failed generation
        if failures:
            raise RuntimeError("Failure occurred during data feature "
                               "computation. See logging.")

        return de_map
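
# A trimmed sketch of the queue-then-collect pattern above: every input is
# mapped to its AsyncResult, the pool input is closed, and results are pulled
# back one by one so progress can be reported and failures (None returns)
# counted. The worker is a stand-in for the descriptor generator.
import multiprocessing.pool

def _describe(item):
    # Return None to signal failure, a value otherwise.
    return None if item < 0 else item * item

if __name__ == '__main__':
    items = [3, 1, 4, 1, 5]
    pool = multiprocessing.pool.ThreadPool(processes=2)
    ar_map = {i: pool.apply_async(_describe, (item,))
              for i, item in enumerate(items)}
    pool.close()

    failures = 0
    results = {}
    for n, (key, ar) in enumerate(ar_map.items(), start=1):
        value = ar.get()
        if value is None:
            failures += 1
        else:
            results[key] = value
        print(f'Progress: [{n}/{len(ar_map)}]')
    pool.join()

    if failures:
        raise RuntimeError(f'{failures} item(s) failed descriptor computation')
    print(results)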
Beispiel #58
0
    def classify_async(self,
                       d_iter,
                       factory,
                       overwrite=False,
                       procs=None,
                       use_multiprocessing=False,
                       ri=None):
        """
        Asynchronously classify the DescriptorElements in the given iterable.

        :param d_iter: Iterable of DescriptorElements
        :type d_iter:
            collections.Iterable[smqtk.representation.DescriptorElement]

        :param factory: Classification element factory to use for generating
            result ClassificationElement instances.
        :type factory: smqtk.representation.ClassificationElementFactory

        :param overwrite: Recompute classification of the input descriptor and
            set the results to the ClassificationElement produced by the
            factory.
        :type overwrite: bool

        :param procs: Explicit number of cores/thread/processes to use.
        :type procs: None | int

        :param use_multiprocessing: Use ``multiprocessing.pool.Pool`` instead of
            ``multiprocessing.pool.ThreadPool``.
        :type use_multiprocessing: bool

        :param ri: Progress reporting interval in seconds. Set to a value > 0 to
            enable. Disabled by default.
        :type ri: float | None

        :return: Mapping of input DescriptorElement instances to the computed
            ClassificationElement. ClassificationElement UUIDs are congruent
            with the UUID of the corresponding DescriptorElement.
        :rtype: dict[smqtk.representation.DescriptorElement,
                     smqtk.representation.ClassificationElement]

        """
        self._log.info("Async classifying descriptors")
        ri = ri and ri > 0 and ri

        # Mapping of DataElement to async processing result
        ar_map = {}
        # Mapping of DescriptorElement to its associated ClassificationElement
        #: :type: dict[smqtk.representation.DescriptorElement, smqtk.representation.ClassificationElement]
        d2c_map = {}

        procs = procs and int(procs)
        if use_multiprocessing:
            pool = multiprocessing.pool.Pool(procs)
        else:
            pool = multiprocessing.pool.ThreadPool(procs)

        self._log.info("Queueing async work")
        i = j = 0
        s = lt = time.time()
        for d in d_iter:
            d2c_map[d] = factory.new_classification(self.name, d.uuid())
            i += 1
            if overwrite or not d2c_map[d].has_classifications():
                ar_map[d] = pool.apply_async(_async_helper_classify,
                                             args=(self, d))
                j += 1

            t = time.time()
            if ri and t - lt >= ri:
                self._log.debug(
                    "-- Scanned = %d :: Queued = %d "
                    "(per second = %f)", i, j, i / (t - s))
                lt = t
        # Close pool input
        pool.close()

        self._log.info("Collecting results")
        failures = False
        s = lt = time.time()
        for i, (d, ar) in enumerate(ar_map.items()):
            c = ar.get()
            if c is None:
                failures = True
                continue
            else:
                d2c_map[d].set_classification(c)

            # progress reporting
            t = time.time()
            if ri and t - lt >= ri:
                self._log.debug("-- Complete = %d "
                                "(per second = %f)", i, i / (t - s))
                lt = t
        pool.join()

        if failures:
            raise RuntimeError("Failure occurred during descriptor "
                               "classification. See logging.")

        return d2c_map
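
_async_helper_classify follows the same shape: a module-level function so it can be pickled when use_multiprocessing selects a process pool, and it returns None on error to match the failure check in the collection loop. A hedged sketch, with the classifier call being an assumed API:

import logging


def _async_helper_classify(classifier, descriptor):
    """Sketch: classify one DescriptorElement inside a pool worker."""
    try:
        # Assumed per-descriptor API on the classifier instance.
        return classifier._classify(descriptor)
    except Exception:
        logging.getLogger(__name__).exception(
            "Classification failed for descriptor %s", descriptor.uuid())
        return None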
Beispiel #59
0
def test_program(bits_to_flip):
    """
    Flip the specified bits in the initial code and then execute the code in an emulator

    Note: we do some crazy stuff with multiprocessing because Unicorn sometimes SIGABRTs and we need to catch it.

    :param flip_operation: XOR, AND, or OR
    :param bits_to_flip: the location of the bits to flip
    :return:
    """
    global code_initial, flip_operation, result_cache, force_invalid_ins

    logger.debug(flip_operation)

    code_input = flip_bits(code_initial, bytes_to_trash, bits_to_flip,
                           flip_operation)

    # print code_input

    # Is it cached?
    code_str = int(''.join(map(str, code_input)))
    if result_cache is not None and code_str in result_cache:
        return result_cache[code_str]

    logger.debug(code_input)
    # convert list to str for emulator
    system_code = ''.join(map(chr, code_input))

    if force_invalid_ins:
        asm = list(CAPSTONE.disasm(system_code, len(code_input)))
        # logger.info("compiled: {}".format(system_code.encode('hex')))
        if len(asm) == 0:
            logger.warning('>>> \tdisasm failure: {}'.format(
                system_code.encode('hex')))
        for ins in asm:
            # print repr(ins.bytes), len(ins.bytes)
            # logger.info(
            #     '>>> {}\t {} {}'.format(binascii.hexlify(ins.bytes),
            #                             ins.mnemonic,
            #                             ins.op_str))
            if ins.bytes == "\x00" * len(ins.bytes) \
                    or ins.bytes == "\xff" * len(ins.bytes):
                logger.debug("Forcing invalid instruction")
                return Result.GLITCH_FAILED_INVALID_INSTRUCTION

    # Create a 1 process pool to execute our emulator in (effectively a sandbox for SIGABRT)
    pool = MyPool(processes=1)

    # Run the emulator
    t = pool.apply_async(run_emulator, (system_code, ))

    # Get the result
    try:
        rtn = t.get(timeout=1)
    except:
        # Sometimes Unicorn will SIGABRT, we need to catch that
        logger.exception("Got a really bad fail!!!")
        rtn = Result.GLITCH_FAILED_SIGABRT
        del t
        # sys.exit(0)

    # Make sure we don't keep making pools
    pool.close()
    pool.terminate()
    pool.join()
    del pool

    if result_cache is not None:
        result_cache[code_str] = rtn
    return rtn
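
The single-worker pool above acts as a crash sandbox: if Unicorn aborts, only the child process dies and the parent recovers through the one-second timeout, then discards the pool. The same pattern stripped of the caching and bit-flipping specifics looks roughly like this (run_sandboxed is illustrative and not part of the original code):

import multiprocessing


def run_sandboxed(func, args, timeout=1):
    """Run func(*args) in a throwaway single-process pool.

    func must be picklable (i.e. defined at module level). Returns the result,
    or None if the worker crashed (e.g. a C extension called abort()) or did
    not finish within the timeout.
    """
    pool = multiprocessing.Pool(processes=1)
    try:
        return pool.apply_async(func, args).get(timeout=timeout)
    except Exception:
        return None
    finally:
        # A fresh pool per call guarantees a dead worker never lingers.
        pool.terminate()
        pool.join()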
Beispiel #60
0
def baseline_batch():

    flags = get_arguments()

    # Read script arguments from run.config file.

    num_processors, num_runs, seeds = config_parser.parse_run_params(
        print_params=False)

    if len(seeds) != num_runs:
        raise configparser.Error(
            'Number of seeds in run.config `seeds`'
            ' must match the number of runs (`num_runs`) argument.')

    print('Arguments (baseline.py):')
    print('-----------------------')
    print('Number of runs: {0}'.format(num_runs))
    print('Number of processors: {0}'.format(num_processors))
    print('Train seeds: {0}\n'.format(seeds))

    # Assess total number of processors.
    processors_total = mp.cpu_count()
    print(f'Total number of processors available: {processors_total}\n')

    # Adjust number of processors.
    if num_processors > processors_total:
        num_processors = processors_total
        print(f'Number of processors downgraded to {num_processors}\n')

    # Read train.py arguments from train.config file.
    baseline_config = configparser.ConfigParser()
    baseline_path = CONFIG_PATH / 'train.config'
    baseline_config.read(str(baseline_path))

    # Setup sumo-tls-type.
    baseline_config.set('train_args', 'tls_type', flags.tls_type)
    baseline_config.set('train_args', 'experiment_save_agent', str(False))

    # Override train configurations with test parameters.
    test_config = configparser.ConfigParser()
    test_path = CONFIG_PATH / 'test.config'
    test_config.read(test_path.as_posix())

    horizon = int(test_config.get('test_args', 'rollout-time'))
    baseline_config.set('train_args', 'experiment_time', str(horizon))

    # Write .xml files for test plots creation.
    baseline_config.set('train_args', 'sumo_emission', str(True))

    timestamp = datetime.now().strftime('%Y%m%d%H%M%S.%f')
    print(f'Experiment timestamp: {timestamp}')

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Create a config file for each train.py
        # with the respective seed. These config
        # files are stored in a temporary directory.
        tmp_path = Path(tmp_dir)
        baseline_configs = []
        for seed in seeds:

            cfg_path = tmp_path / f'{flags.tls_type}-{seed}.config'
            baseline_configs.append(cfg_path)

            # Setup train seed.
            baseline_config.set("train_args", "experiment_seed", str(seed + 1))

            # Write temporary train config file.
            with cfg_path.open('w') as ft:
                baseline_config.write(ft)

        # rvs: directories' names holding experiment data
        if num_processors > 1:
            packed_args = list(enumerate(baseline_configs))
            pool = NonDaemonicPool(num_processors)
            rvs = pool.map(delay_baseline, packed_args)
            pool.close()
            pool.join()
        else:
            rvs = []
            for cfg in baseline_configs:
                rvs.append(delay_baseline((0.0, cfg)))

        # Create a directory and move newly created files
        paths = [Path(f) for f in rvs]
        commons = [p.parent for p in paths]
        if len(set(commons)) > 1:
            raise ValueError(
                f'Directories {set(commons)} must have the same root')
        dirpath = commons[0]
        batchpath = dirpath / timestamp
        if not batchpath.exists():
            batchpath.mkdir()

        # Move files
        for src in paths:
            dst = batchpath / src.parts[-1]
            src.replace(dst)

    sys.stdout.write(str(batchpath))

    return str(batchpath)
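
NonDaemonicPool is used above presumably because the per-run workers need to spawn child processes of their own, which daemonic pool workers are not allowed to do. The project's implementation is not shown here; one common Python 3 recipe for such a pool is sketched below (an assumption about how it might be built, not the imported class itself):

import multiprocessing
import multiprocessing.pool


class NoDaemonProcess(multiprocessing.Process):
    """Worker process whose daemon flag is pinned to False."""

    @property
    def daemon(self):
        return False

    @daemon.setter
    def daemon(self, value):
        # Ignore Pool's attempt to mark its workers as daemonic.
        pass


class NoDaemonContext(type(multiprocessing.get_context())):
    Process = NoDaemonProcess


class NonDaemonicPool(multiprocessing.pool.Pool):
    """Pool whose workers are allowed to create child processes."""

    def __init__(self, *args, **kwargs):
        kwargs['context'] = NoDaemonContext()
        super().__init__(*args, **kwargs)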