Example #1
def process_iteration(Ns, ps, landscape, config):
	output_dir = config.output_dir + config.ext
	
	if config.background_image is not None:
		background_path = config.input_dir + "/" + config.background_image
	else:
		background_path = None
	
	#Create a point to hold the iteration
	p = Point()
	p.add_iteration()
	
	#draw_population(Ns[0], landscape, ps.totalK, 0, output_dir, 2.0, background_path)
	
	if config.display:
		pool = mp.Pool(config.num_processors)

	for t in xrange(min(ps.max_time_steps, len(Ns))):
		if config.display:
			pool.apply_async(draw_population, [Ns[t], landscape, ps.totalK, t, output_dir, 2.0, background_path])
		
		p.add_time_step([t] + population_statistics(ps, landscape, Ns[t]))
	
	#The pool only exists when display is enabled
	if config.display:
		pool.close()

	#Write the iteration results to file as a trajectory containing a single point
	write_trajectories([Trajectory(points=[p])], None, ps.sentinels, output_dir + "/results.txt")

	if config.save_time_steps:
		np.savez(output_dir + "/populations.npz", *Ns)

	if config.display:
		pool.join()
def pass_data_to_search(symbol,path,start_time_seconds,end_time_seconds,date,time_interval,tt,code_path):

    jobs=[]
    dic_files={}
    lis=[]
    slot_results=[]
    
    file_name = path+'b'+date+'.l.bz2'
    # file_name = path + date+'/'+dic_files[lis[index]]+'.bz2'
        
    size = os.path.getsize(file_name)
    # Each record is 69 bytes; integer division keeps the row and slot counts whole
    total_rows = size // 69
    total_processes1 = 40
    slots = total_rows // total_processes1

    #Multiprocessing each file as chunk
    # mapper(0,slots,total_processes1,symbol,start_time_seconds,end_time_seconds,date,time_interval,file_name,tt,code_path)
    # mapper(1,slots,total_processes1,symbol,start_time_seconds,end_time_seconds,date,time_interval,file_name,tt,code_path)
    
    pool = multiprocessing.Pool(total_processes1)
    

    for i in range(total_processes1):

        pool.apply_async(mapper, args = (i,slots,total_processes1,symbol,start_time_seconds,end_time_seconds,date,time_interval,file_name,tt,code_path))
        
    pool.close()
    pool.join()    
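The mapper worker targeted by each apply_async call above is not shown. Given the arithmetic in this function (fixed 69-byte records, total_rows split evenly into total_processes1 slots), worker i would be responsible for rows [i*slots, (i+1)*slots). A minimal, hypothetical sketch of just that slicing; the real mapper's parsing and search logic is not part of the snippet:

def mapper(i, slots, total_processes, symbol, start_time_seconds, end_time_seconds,
           date, time_interval, file_name, tt, code_path):
    # Hypothetical body: only the row-range arithmetic is illustrated.
    start_row = i * slots
    # Let the last worker absorb any remainder rows left by the integer division.
    end_row = (i + 1) * slots if i < total_processes - 1 else None
    # ... decompress file_name, skip to start_row, scan until end_row, and filter
    # by symbol and the [start_time_seconds, end_time_seconds] window ...
    return start_row, end_row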
    def _listArtifacts(self, urls, gavs):
        """
        Loads maven artifacts from list of GAVs and tries to locate the artifacts in one of the
        specified repositories.

        :param urls: repository URLs where the given GAVs can be located
        :param gavs: List of GAVs
        :returns: Dictionary where the key is a MavenArtifact object and the value is its repo root URL.
        """
        def findArtifact(gav, urls, artifacts):
            artifact = MavenArtifact.createFromGAV(gav)
            for url in urls:
                if maven_repo_util.gavExists(url, artifact):
                    #Critical section?
                    artifacts[artifact] = ArtifactSpec(url, [ArtifactType(artifact.artifactType, True, set(['']))])
                    return

            logging.warning('Artifact %s not found in any url!', artifact)

        artifacts = {}
        pool = ThreadPool(maven_repo_util.MAX_THREADS)
        for gav in gavs:
            pool.apply_async(findArtifact, [gav, urls, artifacts])

        # Close the pool and wait for the workers to finish
        pool.close()
        pool.join()

        return artifacts
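The "#Critical section?" comment above asks whether the shared artifacts dict needs protection. A single dict assignment is effectively atomic under CPython's GIL, so the code works as written, but an explicit lock documents the intent and stays correct if the update ever grows beyond one statement. A minimal sketch under that assumption, reusing the module's own MavenArtifact/ArtifactSpec helpers (the lock is an addition, not in the original):

import threading

artifacts_lock = threading.Lock()  # added for illustration; not in the original code

def findArtifact(gav, urls, artifacts):
    artifact = MavenArtifact.createFromGAV(gav)
    for url in urls:
        if maven_repo_util.gavExists(url, artifact):
            # explicit critical section around the shared-dict update
            with artifacts_lock:
                artifacts[artifact] = ArtifactSpec(url, [ArtifactType(artifact.artifactType, True, set(['']))])
            return
    logging.warning('Artifact %s not found in any url!', artifact)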
def main():
    if len(sys.argv) < 3:
        print("Syntax:")
        print(
            "  {} [min_yeses] [out_csv_file]".format(
                sys.argv[0]
            )
        )
        sys.exit(1)

    # Note: eval() on arbitrary CLI input is risky; ast.literal_eval would be the safer
    # way to parse a literal list such as "[1, 2, 3]"
    min_yeses = eval(sys.argv[1])
    out_csv_file = sys.argv[2]

    pconfig = config.PaperworkConfig()
    pconfig.read()

    src_dir = pconfig.settings['workdir'].value
    print("Source work directory : {}".format(src_dir))
    src_dsearch = docsearch.DocSearch(src_dir)
    src_dsearch.reload_index()

    nb_threads = multiprocessing.cpu_count()
    pool = multiprocessing.pool.ThreadPool(processes=nb_threads)

    with open(out_csv_file, 'a', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        for min_yes in min_yeses:
            pool.apply_async(
                _run_simulation,
                (src_dsearch, min_yes, csvwriter,)
            )
        pool.close()
        pool.join()
    print("All done !")
Example #5
def papply( f, seq, pool_size=cores, callback=None ):
    """
    Apply the given function to each element of the given sequence, optionally invoking the given
    callback with the result of each application. Do so in parallel, using a thread pool no
    larger than the given size.

    :param callable f: the function to be applied

    :param Sequence seq: the input sequence

    :param int pool_size: the desired pool size, if absent the number of CPU cores will be used.
            The actual pool size may be smaller if the input sequence is small.

    :param callable callback: an optional function to be invoked with the return value of f

    >>> l=[]; papply( lambda a, b: a + b, [], 1, callback=l.append ); l
    []
    >>> l=[]; papply( lambda a, b: a + b, [ (1, 2) ], 1, callback=l.append); l
    [3]
    >>> l=[]; papply( lambda a, b: a + b, [ (1, 2), (3, 4) ], 1, callback=l.append ); l
    [3, 7]
    """
    if pool_size == 1:
        for args in seq:
            result = f( *args )
            if callback is not None:
                callback( result )
    else:
        with thread_pool( min( pool_size, len( seq ) ) ) as pool:
            for args in seq:
                pool.apply_async( f, args, callback=callback )
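thread_pool is not defined in this snippet; from the way it is used, it is presumably a small context manager that yields a ThreadPool and closes and joins it when the block exits. A sketch under that assumption:

import contextlib
import multiprocessing.pool

@contextlib.contextmanager
def thread_pool(size):
    # assumed helper: yield a thread pool, then close and join it on exit
    pool = multiprocessing.pool.ThreadPool(processes=size)
    try:
        yield pool
    finally:
        pool.close()
        pool.join()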
def create_process_pool(index):
    print index
    li = range(3)
    pool = multiprocessing.Pool(processes=len(li))
    for sub_index in li:
        pool.apply_async(print_process_index, (index, sub_index))
    pool.close()
    pool.join()
Example #7
def update_all(opts):
    """Updates all menus"""
    pool = NoDaemonPool(processes=5)
    pool.apply_async(update_applications, (opts,))
    pool.apply_async(update_bookmarks, (opts,))
    pool.apply_async(update_recent_files, (opts,))
    pool.apply_async(update_devices, (opts,))
    pool.apply_async(update_rootmenu, (opts,))
    pool.close()
    pool.join()
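NoDaemonPool is not a standard-library class and its definition is not shown here; the usual recipe is a multiprocessing pool whose workers are non-daemonic, so that each update_* task may spawn subprocesses of its own. A sketch of the classic version of that recipe (on Python 3.8+ Pool.Process is resolved through a context object, so newer code typically routes the same override through a custom context):

import multiprocessing
import multiprocessing.pool

class NoDaemonProcess(multiprocessing.Process):
    # Report daemon as False and ignore attempts to change it, so the
    # worker process is allowed to start child processes of its own.
    @property
    def daemon(self):
        return False

    @daemon.setter
    def daemon(self, value):
        pass

class NoDaemonPool(multiprocessing.pool.Pool):
    # Classic recipe; the project's actual definition may differ.
    Process = NoDaemonProcess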
    def buildList(self):
        """
        Build the artifact "list" from sources defined in the given configuration.

        :returns: Dictionary described above.
        """
        priority = 0
        pool_dict = {}

        for source in self.configuration.artifactSources:
            priority += 1
            pool = pool_dict.setdefault(source['type'], ThreadPool(self.MAX_THREADS_DICT[source['type']]))
            pool.apply_async(self._read_artifact_source, args=[source, priority],
                             callback=self._add_result)

        for pool in pool_dict.values():
            pool.close()

        at_least_1_runs = True
        all_keys = list(range(1, len(self.configuration.artifactSources) + 1))
        finished = False
        while at_least_1_runs:
            for i in range(30):
                time.sleep(1)

                if not self.errors.empty():
                    for pool in pool_dict.values():
                        logging.debug("Terminating pool %s", str(pool))
                        pool.terminate()
                    finished = True
                    break

            at_least_1_runs = False
            if not finished:
                self.results_lock.acquire()
                # priorities that have already delivered results
                finished = sorted(list(self.results.keys()))
                self.results_lock.release()
                if all_keys != finished:
                    logging.debug("Still waiting for priorities %s to finish", str(list(set(all_keys) - set(finished))))
                    at_least_1_runs = True

        for pool in pool_dict.values():
            if pool._state != multiprocessing.pool.TERMINATE:
                pool.join()

        if not self.errors.empty():
            raise RuntimeError("%i error(s) occurred during reading of artifact list." % self.errors.qsize())

        return self._get_artifact_list()
Example #9
def papply( f, seq, pool_size=cores, callback=None ):
    """
    Apply the given function to each element of the given sequence, optionally invoking the given
    callback with the result of each application. Do so in parallel, using a thread pool no
    larger than the given size.

    :param callable f: the function to be applied

    :param Sequence seq: the input sequence

    :param int pool_size: the desired pool size, if absent the number of CPU cores will be used.
    The actual pool size may be smaller if the input sequence is small. A pool size of 0 will make
    this function emulate the apply() builtin, i.e. f (and the callback, if provided) will be
    invoked serially in the current thread.

    :param callable callback: an optional function to be invoked with the return value of f

    >>> l=[]; papply( lambda a, b: a + b, [], pool_size=0, callback=l.append ); l
    []
    >>> l=[]; papply( lambda a, b: a + b, [ (1, 2) ], pool_size=0, callback=l.append); l
    [3]
    >>> l=[]; papply( lambda a, b: a + b, [ (1, 2), (3, 4) ], pool_size=0, callback=l.append ); l
    [3, 7]
    >>> l=[]; papply( lambda a, b: a + b, [], pool_size=1, callback=l.append ); l
    []
    >>> l=[]; papply( lambda a, b: a + b, [ (1, 2) ], pool_size=1, callback=l.append); l
    [3]
    >>> l=[]; papply( lambda a, b: a + b, [ (1, 2), (3, 4) ], pool_size=1, callback=l.append ); l
    [3, 7]
    >>> l=[]; papply( lambda a, b: a + b, [], pool_size=2, callback=l.append ); l
    []
    >>> l=[]; papply( lambda a, b: a + b, [ (1, 2) ], pool_size=2, callback=l.append); l
    [3]
    >>> l=[]; papply( lambda a, b: a + b, [ (1, 2), (3, 4) ], pool_size=2, callback=l.append ); l
    [3, 7]
    """
    __check_pool_size( pool_size )
    n = len( seq )
    if n:
        if pool_size == 0:
            for args in seq:
                result = f( *args )
                if callback is not None:
                    callback( result )
        else:
            with thread_pool( min( pool_size, n ) ) as pool:
                for args in seq:
                    pool.apply_async( f, args, callback=callback )
 def func_wrapper(*args, **kwargs):
     """Closure for function."""
     pool = multiprocessing.pool.ThreadPool(processes=1)
     async_result = pool.apply_async(item, args, kwargs)
     # raises a TimeoutError if execution exceeds max_timeout
     # print async_result.get(max_timeout)
     return async_result.get(max_timeout)
Example #11
def run_trajectory(t, ps, landscape, ptv, num_iterations, num_processors):
    # Get the points in the trajectory
    points = t.points()

    # Determine the index of each unique point (sometimes points are equal due to rounding)
    uinds = [i for i, p in enumerate(points) if i == 0 or not p.equals(points[i - 1])]

    # Create a process pool, using as many processors as are available, or
    # are required to allow each point to run concurrently
    pool = mp.Pool(processes=min(num_processors, len(points)))

    results = []
    for i in uinds:
        # Modify the parameter set to match the current point
        psm = ps.copy()
        psm.modify_for_point(points[i], ptv)
        psm.convert_to_age_classes()

        # Launch a process to run the simulation(s) for the point. This modifies the point in place
        args = [points[i], psm, landscape, num_iterations, num_processors]
        results.append(pool.apply_async(run_iterations_for_point, args))

    pool.close()
    pool.join()

    # Merge the unique and non-unique points back together
    for i, r in zip(uinds, results):
        points[i] = r.get(None)

    # Return a new trajectory containing the results for each point
    return io.Trajectory(points=points)
 def func_wrapper(self, *args, **kwargs):
     """Closure for function."""
     pool = multiprocessing.pool.ThreadPool(processes=1)
     async_result = pool.apply_async(f, (self,) + args, kwargs)
     timeout = kwargs.pop('timeout_max_timeout', max_timeout) or max_timeout
     # raises a TimeoutError if execution exceeds max_timeout
     return async_result.get(timeout)
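Both func_wrapper closures above follow the same timeout-decorator pattern: run the wrapped call in a single-thread pool and let AsyncResult.get(timeout) raise multiprocessing.TimeoutError when the call overruns. A self-contained sketch of the full decorator such a closure typically sits inside (the decorator name and usage are illustrative):

import functools
import multiprocessing.pool

def timeout(max_timeout):
    """Return a decorator that aborts a call after max_timeout seconds."""
    def decorator(item):
        @functools.wraps(item)
        def func_wrapper(*args, **kwargs):
            pool = multiprocessing.pool.ThreadPool(processes=1)
            try:
                async_result = pool.apply_async(item, args, kwargs)
                # raises multiprocessing.TimeoutError if execution exceeds max_timeout
                return async_result.get(max_timeout)
            finally:
                # close() only stops new submissions; a timed-out call keeps
                # running in its worker thread until it finishes on its own
                pool.close()
        return func_wrapper
    return decorator

@timeout(5.0)
def slow_lookup(key):
    ...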
def compute_stats(client_factory, db_names=None, table_names=None,
    continue_on_error=False, parallelism=multiprocessing.cpu_count()):
  """
  Runs COMPUTE STATS over the selected tables. The target tables can be filtered by
  specifying a list of databases and/or table names. If no filters are specified this will
  run COMPUTE STATS on all tables in all databases.

  parallelism controls the size of the thread pool to which compute_stats
  is sent.
  """
  logging.info("Enumerating databases and tables for compute stats.")

  pool = multiprocessing.pool.ThreadPool(processes=parallelism)
  futures = []
  with client_factory() as impala_client:
    all_dbs = set(name.split('\t')[0].lower() for name
        in impala_client.execute("show databases").data)
    selected_dbs = all_dbs if db_names is None else set(db_names)
    for db in all_dbs.intersection(selected_dbs):
      all_tables =\
          set([t.lower() for t in impala_client.execute("show tables in %s" % db).data])
      selected_tables = all_tables if table_names is None else set(table_names)
      for table in all_tables.intersection(selected_tables):
        # Submit command to threadpool
        futures.append(pool.apply_async(compute_stats_table,
            (client_factory, db, table, continue_on_error,)))
    # Wait for all stats commands to finish
    for f in futures:
      f.get()
Example #14
  def _run_tests(self):
    pool = multiprocessing.pool.ThreadPool(processes=self.suite_concurrency)
    outstanding_suites = []
    for suite in self.suite_runners:
      suite.task = pool.apply_async(suite.run)
      outstanding_suites.append(suite)

    ret = True
    try:
      while len(outstanding_suites) > 0:
        for suite in list(outstanding_suites):
          if suite.timed_out():
            msg = "Task %s not finished within timeout %s" % (suite.name,
                suite.suite.timeout_minutes,)
            logging.error(msg)
            raise Exception(msg)
          task = suite.task
          if task.ready():
            this_task_ret = task.get()
            outstanding_suites.remove(suite)
            if this_task_ret:
              logging.info("Suite %s succeeded.", suite.name)
            else:
              logging.info("Suite %s failed.", suite.name)
              ret = False
        time.sleep(5)
    except KeyboardInterrupt:
      logging.info("\n\nDetected KeyboardInterrupt; shutting down!\n\n")
      raise
    finally:
      pool.terminate()
    return ret
Example #15
def parallel_reduce(func, iterable, processes=4, args=(), kwargs={}):
    #print "Made it to parallel reduce!"
    #print 'Iterable Set to Reduce: ', iterable

    comp_stack = list(iterable)
    pair_list = []

    pool = multiprocessing.pool.Pool(processes)

    while len(comp_stack) > 1:
        #Pair up elements; with an odd count the unpaired element stays in
        #comp_stack and is carried into the next round instead of being lost
        while len(comp_stack) > 1:
            pair_list.append((comp_stack.pop(), comp_stack.pop()))

        #print 'List of pairs to reduce: ', pair_list

        results = []
        while len(pair_list) > 0:
            pair = pair_list.pop()
            results.append(pool.apply_async(func, pair))

        #print 'Async Result Objects: ', results

        #get() blocks until each pair is reduced, so no busy-wait on ready() is needed
        comp_stack = comp_stack + [result.get() for result in results]
        #print 'After reduce: ', comp_stack

    pool.close()
    pool.join()
    return comp_stack
Example #16
def from_carrays(path, format_categories='bcolz', format_codes='bcolz', format_values='bcolz', parallel=True):
    assert os.path.exists(path), 'No path {}'.format(path)
    df_columns = glob.glob(os.path.join(path, '*'))
    df = dict()
    if parallel:
        pool = multiprocessing.pool.ThreadPool()
        results = []
        for i, k in enumerate(df_columns):
            p = pool.apply_async(_from_carray, args=(k,), kwds={'format_categories': format_categories, 'format_codes': format_codes, 'format_values': format_values})
            results.append(p)
        pool.close()
        pool.join()
        for x in results:
            meta, s = x.get()
            df[meta['name']] = s
    else:
        for i, k in enumerate(df_columns):
            meta, s = _from_carray(k, format_categories=format_categories, format_codes=format_codes, format_values=format_values)
            df[meta['name']] = s

    # this is slow when we have non-categorical series for some reason
    with log.timedlogger('constructing dataframe from %s column dict' % len(df)):
        df = pandas.DataFrame(df)  # TODO: fast DataFrame constructor

    return df
def main():
  if len(sys.argv) > 1:
    _, pkg_name, pkg_version = sys.argv
    download_package(pkg_name, pkg_version)
    return

  pool = multiprocessing.pool.ThreadPool(processes=min(multiprocessing.cpu_count(), 4))
  results = []

  for requirements_file in REQUIREMENTS_FILES:
    # If the package name and version are not specified in the command line arguments,
    # download the packages listed in requirements.txt.
    # requirements.txt follows the standard pip grammar.
    for line in open(requirements_file):
      # A hash symbol ("#") starts a comment that should be ignored.
      line = line.split("#")[0]
      # A semicolon (";") specifies an additional condition for when the package
      # should be installed (for example a specific OS). We can ignore this and download
      # the package anyway because the installation script (bootstrap_virtualenv.py) can
      # take it into account.
      l = line.split(";")[0].strip()
      if not l:
        continue
      pkg_name, pkg_version = l.split('==')
      results.append(pool.apply_async(
        download_package, args=[pkg_name.strip(), pkg_version.strip()]))

    for x in results:
      x.get()
Example #18
 def _queue_job(self, pool, key, data_file, data_file_size):
     pool.apply_async(
         _fetch_and_process_chunk,
         [],
         {
             "app_config": self.config,
             "debug": self.debug,
             "data_file": data_file,
             "data_file_size": data_file_size,
             "download_progress_per_file": self.download_progress_per_file,
             "site": self.site,
             "pgdata": self.pgdata,
             "tablespaces": self.tablespaces,
         },
         lambda *args: self.job_completed(key),
         lambda exception: self.job_failed(key, exception),
     )
 def test_multi_own_ca(self):
     pool = multiprocessing.pool.ThreadPool(processes=5)
     threads = []
     for i in range(5):
         threads.append(pool.apply_async(issue_n_certs, ("ownca", range(5))))
     vals = []
     for t in threads:
         vals.extend(t.get())
     nt.assert_equal(sorted(vals), sorted(list(set(vals))))
Example #20
def update(opts):
    opts = options_from_config(opts)
    actions = []
    if opts.with_applications:
        actions.append(update_applications)
    if opts.with_bookmarks:
        actions.append(update_bookmarks)
    if opts.with_recent_files:
        actions.append(update_recent_files)
    if opts.with_devices:
        actions.append(update_devices)
    num_actions = len(actions)
    if num_actions == 1:
        actions[0](opts)
    else:
        pool = NoDaemonPool(processes=num_actions)
        for action in actions:
            pool.apply_async(action, (opts, True))
        pool.close()
        pool.join()
def prepopulate(gitURLs):
    pool = multiprocessing.pool.ThreadPool(_CONCURRENCY)
    futures = []
    for url in gitURLs:
        if url in _cache:
            continue
        mirror = repomirror.RepoMirror(url)
        future = pool.apply_async(_fetchSubthread, args=(mirror,))
        futures.append(future)
    for future in futures:
        future.get()
    def test_multithread(self):
        ca = ezbakeca.EzbakeCA("threadingCA")
        pool = multiprocessing.pool.ThreadPool(processes=5)

        threads = []
        for i in range(5):
            threads.append(pool.apply_async(issue_n_certs, (ca, range(5))))
        vals = []
        for t in threads:
            vals.extend(t.get())
        nt.assert_equal(sorted(vals), sorted(list(set(vals))))
Example #23
def test_recursive_parallel_reduce(workers = 5):
    
    pool = RecursivePool()
    
    ranges = [range(1, 5), range(2, 9), range(3, 7)]
    
    print ranges
    
    results = []

    for myrange in ranges:
        pool.apply_async(parallel_reduce, [sum, myrange], 
            callback= results.append)

    pool.close()
    pool.join()

    print results

#if __name__ == '__main__':
#    test_recursive_parallel_reduce()
    def test_multithread(self):
        ca = ezbakeca.EzbakeCA("threadingCA")
        ca.save()
        pool = multiprocessing.pool.ThreadPool(processes=5)

        threads = []
        for i in range(5):
            threads.append(pool.apply_async(issue_n_certs, (ca, range(5))))
        vals = []
        for t in threads:
            vals.extend(t.get())
        ca.save() # save since the threads might still be writing the serial file
        nt.assert_equal(sorted(vals), sorted(list(set(vals))))
Example #25
    def do_get_sample_prompts_list(self):
        pool = multiprocessing.pool.ThreadPool(processes=8)

        # Kick off the "Current" meta-sample
        current_metasample_async = pool.apply_async(self.do_get_current_prompt)

        # Read all of the prompts in sample_prompts
        paths = glob.iglob('sample_prompts/*.fish')
        sample_results = pool.map(self.read_one_sample_prompt, paths, 1)

        # Finish up
        result = []
        result.append(current_metasample_async.get())
        result.extend([r for r in sample_results if r])
        return result
Example #26
def run_parrallel_iterations(ps, landscape, config):
    iteration = 0

    if config.num_iterations > 1:
        means = [np.zeros(landscape.shape) for _ in xrange(ps.max_time_steps)]

    #run_iteration(ps, landscape)

    #Perform the iterations using a process pool for concurrency
    pool = mp.Pool(config.num_processors)
    print "running...", config.num_iterations, "iterations for", ps.max_time_steps, "timesteps on", config.num_processors, "processors"
    results = [
        pool.apply_async(run_iteration, args=[ps, landscape])
        for _ in xrange(config.num_iterations)
    ]
    pool.close()

    #Process iterations as they complete
    while len(results) > 0:
        completed = [i for i, r in enumerate(results) if r.ready()]
        for i in reversed(
                completed
        ):  #reversed so that indices aren't invalidated as we pop
            print "processing iteration " + str(iteration + 1)
            #Get the result from the list and save the iteration to file
            Ns = results.pop(i).get(None)
            config.ext = "/iteration " + str(iteration + 1)
            process_iteration(Ns, ps, landscape, config)
            iteration += 1

            if config.num_iterations > 1:
                #Add the population for each time step in the iteration to the total
                for t, N in enumerate(Ns):
                    means[t] += N

        time.sleep(15)

    pool.join()
    #run_iteration(ps, landscape)

    if config.num_iterations > 1:
        for N in means:
            N /= config.num_iterations
        config.ext = "/means"
        io.process_iteration(means, ps, landscape, config)
        def run_parallel(*args, **kwargs):
            # pylint: disable=missing-docstring
            if 'callback_' in kwargs:
                callback = kwargs['callback_']
                del kwargs['callback_']
            else:
                callback = None

            async_res = dict()
            pool = multiprocessing.pool.ThreadPool(len(self.nodes))
            for node in self.nodes:
                if callback:
                    node_callback = _insert_arg0(callback, node)
                else:
                    node_callback = None
                func = getattr(node, name)
                res = pool.apply_async(func, args, kwargs, node_callback)
                async_res[node] = res
            return {n: r.get() for n, r in async_res.iteritems()}
def run(jobs, threads=None):
    if threads is None:
        threads = len(jobs)
    pool = multiprocessing.pool.ThreadPool(processes=threads)
    try:
        futures = []
        for job in jobs:
            kwargs = dict(job)
            args = ()
            del kwargs['callback']
            if 'args' in job:
                args = job['args']
                del kwargs['args']
            futures.append(pool.apply_async(_safeRun, args=(job['callback'], args, kwargs)))
        for future in futures:
            future.wait(timeout=2 ** 31)
        for future in futures:
            future.get()
    finally:
        pool.close()
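The _safeRun indirection above presumably exists so a failing job does not disappear before get() is reached. Since Python 3, apply_async also accepts an error_callback that is invoked with the exception as soon as a job fails; a sketch of the same fan-out using it (the function and variable names are illustrative):

import logging
import multiprocessing.pool

def run_with_error_callback(jobs, threads=None):
    pool = multiprocessing.pool.ThreadPool(processes=threads if threads is not None else len(jobs))
    try:
        futures = []
        for job in jobs:
            kwargs = {k: v for k, v in job.items() if k not in ('callback', 'args')}
            futures.append(pool.apply_async(
                job['callback'],
                job.get('args', ()),
                kwargs,
                error_callback=lambda exc: logging.error("job failed: %s", exc)))
        for future in futures:
            future.get()  # still re-raises the first failure for the caller
    finally:
        pool.close()
        pool.join()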
def main():
    parser = argparse.ArgumentParser(description='Download All Photos from iCloud')
    parser.add_argument('-apple_id', required=True, help="Your AppleID (password must be in KeyChain")
    parser.add_argument('-password', required=True)  # TODO switch to using keyring
    parser.add_argument('-folder', required=True, help='Path to Download Photos To')
    app_args = parser.parse_args()

    app_args.folder = os.path.expanduser(app_args.folder)

    with multiprocessing.pool.ThreadPool(10) as pool:
        api = PyiCloudService(app_args.apple_id, app_args.password)
        photos = list(api.photos.all)

        file_dups = dict()
        for photo in photos:
            if photo.filename not in file_dups:
                file_dups[photo.filename] = 1
            else:
                file_dups[photo.filename] += 1

        file_dups = {fname: count for fname, count in file_dups.items() if count > 1}

        for photo in photos:
            fname = photo.filename

            if fname in file_dups:
                assert file_dups[fname] > 0
                file_dups[fname] -= 1

                if file_dups[fname] > 0:
                    name, ext = fname.rsplit('.', 1)
                    fname = '{}-{}.{}'.format(name, file_dups[fname], ext)

            photo_path = os.path.join(app_args.folder, fname)
            if os.path.exists(photo_path) and os.path.getsize(photo_path) == photo.size:
                continue

            r = pool.apply_async(download_photo, [photo, photo_path])

        pool.close()
        pool.join()
Example #30
def main():
    readfilepath = sys.argv[1]
    writefilepath = sys.argv[2]

    with open(readfilepath) as readfile:
        target = readfile.readline()
        target = target.replace(" ", "").replace("\n", "").split(",")
        condition = {}
        for item in target:
            condition[item.split(":")[0]] = item.split(":")[1]
        target = readfile.readline()
        condition["sni"] = condition.get("sni", "on")
        if condition["sni"].lower() in ["on", "true", "1"]:
            condition["sni"] = True
        else:
            condition["sni"] = False
        condition["host"] = condition["host"].encode()
        condition["port"] = int(condition["port"])
        condition["process_num"] = int(condition.get("process_num", 1))
        print(condition)
        with closing(Pool(condition["process_num"])) as pool:
            while target:
                target = target.replace(" ", "").replace("\n", "").split("-")
                startip = ipaddress.ip_address(target[0])
                if len(target) > 1:
                    finiship = ipaddress.ip_address(target[1])
                else:
                    finiship = ipaddress.ip_address(target[0])
                currentip = startip - 1
                while currentip < finiship:
                    currentip = currentip + 1
                    nowip = str(currentip)
                    process = pool.apply_async(
                        check_host, [nowip, condition, writefilepath])
                target = readfile.readline()
            pool.close()
            pool.join()
Example #31
        elif cmd[0] == 'mul':
            regs[cmd[1]] *= val(cmd[2])
        elif cmd[0] == 'mod':
            regs[cmd[1]] %= val(cmd[2])
        elif cmd[0] == 'rcv':
            if inqueue:
                regs[cmd[1]] = inqueue.get()
            elif regs[cmd[1]] != 0:
                return played
        elif cmd[0] == 'jgz':
            if val(cmd[1]) > 0:
                pc += val(cmd[2])
                continue
        pc += 1

    return count


print('PART 1:', run(0, None, None))

pool = multiprocessing.pool.ThreadPool(processes=2)

q1 = multiprocessing.Queue()
q2 = multiprocessing.Queue()

res1 = pool.apply_async(run, (0, q1, q2))
res2 = pool.apply_async(run, (1, q2, q1))

res1.get()
print('PART 2:', res2.get())
    mongo = MongoDBConnection()
    with mongo:
        db = mongo.connection.HPNorton

        file_list = (product_file, customer_file)
        logging.debug('Successfully obtained file list')
        products = db['products']
        customers = db['customers']
        database_list = (products, customers)
        logging.debug('Got database list, going through files now')
        final_list = []
        MP_list = []
        pool = multiprocessing.pool.ThreadPool(processes=2)
        for filename, database in zip(file_list, database_list):
            logging.debug('Attempting to open %s/%s', directory_name, filename)
            MP = pool.apply_async(insert_data,
                                  (directory_name, filename, database))
            MP_list.append(MP)

        list1 = MP_list[0]
        list2 = MP_list[1]
        final_list = [list1.get(), list2.get()]
        print(final_list)
        return final_list



def import_data(directory_name, product_file, customer_file):
    """
    Takes a directory name three csv files on input (product data, customer data, rentals
    data) and populates new mongo DB and returns two tuples (record count of number or products
    customers, rentals added) (second with count of number of errors occured)
 def __init__(self,
              directory,
              image_data_generator,
              target_size=(256, 256),
              class_mode='binary',
              tags=(('satellite', 'jpg'), ('roadmap', 'png')),
              batch_size=32,
              shuffle=True,
              seed=None,
              data_format=None,
              save_to_dir=None,
              save_prefix='',
              save_format='png',
              subset=None,
              interpolation='nearest'):
     if data_format is None:
         data_format = K.image_data_format()
     self.directory = directory
     self.image_data_generator = image_data_generator
     self.target_size = tuple(target_size)
     if len(tags) != 2:
         raise ValueError('Invalid tags:', tags,
                          '; expected tuple of two tuples.')
     if len(tags[0]) != 2 or len(tags[1]) != 2:
         raise ValueError('Invalid tags:', tags,
                          '; expected tuples of two strings.')
     self.tags = tags
     self.data_format = data_format
     if self.data_format == 'channels_last':
         self.image_shape = self.target_size + (6, )
     else:
         self.image_shape = (6, ) + self.target_size
     if class_mode not in {'binary', None}:
         raise ValueError('Invalid class_mode:', class_mode,
                          '; expected one of "binary" or None.')
     self.class_mode = class_mode
     self.save_to_dir = save_to_dir
     self.save_prefix = save_prefix
     self.save_format = save_format
     self.interpolation = interpolation
     if subset is not None:
         validation_split = self.image_data_generator._validation_split
         if subset == 'validation':
             split = (0, validation_split)
         elif subset == 'training':
             split = (validation_split, 1)
         else:
             raise ValueError('Invalid subset name: ', subset,
                              '; expected "training" or "validation"')
     else:
         split = None
     self.subset = subset
     # First, count the number of samples and classes.
     self.samples = 0
     classes = []
     for subdir in sorted(os.listdir(directory)):
         if os.path.isdir(os.path.join(directory, subdir)):
             classes.append(subdir)
     self.classes = classes
     self.num_classes = len(classes)
     self.class_indices = dict(zip(classes, range(len(classes))))
     pool = multiprocessing.pool.ThreadPool()
     function_partial = partial(_count_valid_files_in_directory,
                                tag=self.tags[0][0],
                                split=split)
     self.samples = sum(
         pool.map(function_partial,
                  (os.path.join(directory, subdir) for subdir in classes)))
     print('Found %d images belonging to %d classes.' %
           (self.samples, self.num_classes))
     # Second, build an index of the images
     # in the different class subfolders.
     results = []
     self.filenames = []
     self.classes = np.zeros((self.samples, ), dtype='int32')
     i = 0
     for dirpath in (os.path.join(directory, subdir) for subdir in classes):
         results.append(
             pool.apply_async(
                 _list_valid_filenames_in_directory,
                 (dirpath, split, self.class_indices, self.tags[0][0])))
     for res in results:
         classes, filenames = res.get()
         self.classes[i:i + len(classes)] = classes
         self.filenames += filenames
         i += len(classes)
     pool.close()
     pool.join()
     super(DirectoryIterator, self).__init__(self.samples, batch_size,
                                             shuffle, seed)
Example #34
    def run_function_different_arguments_parallel(cls, function, arguments, all_success=False,
                                                  signal=None, parallel=True, threads=0,
                                                  *args, **kwargs):
        """
        Call functions in parallel
        :param function: f(argument, **kwargs)
        :param arguments: {i: argument}
        :param all_success: (boolean) the function will raise an exception if one of the runs
            fail and all_success is True
        :param signal: (function) calls this function after generating the jobs. It's used to test
            KeyboardInterrupt, and the signal is a mock of KeyboardInterrupt.
        :param parallel: (boolean) The code is run in parallel only if it's True.
        :param threads: (int) Uses threads instead of processes if threads > 0
        :param args: additional arguments of function
        :param kwargs: additional arguments of function
        :return: {int: output of f(arguments[i])}
        """
        # Maybe later we enable this feature.
        #thread = False

        jobs = {}

        if not parallel:
            return cls.run_function_different_arguments_sequentially(function, arguments, *args,
                                                                     **kwargs)

        n_jobs = min(len(arguments), mp.cpu_count())

        if threads > 0:
            pool = ThreadPool(threads)
        else:
            pool = mp.Pool(processes=n_jobs)

        try:
            for key, argument in arguments.iteritems():
                job = pool.apply_async(function, args=(argument, ) + args, kwds=kwargs)
                jobs[key] = job
            pool.close()
            pool.join()
            if signal is not None:
                signal(1)
        except KeyboardInterrupt:
            logger.info("Ctrl+c received, terminating and joining pool.")
            pool.terminate()
            pool.join()
            return -1

        results = {}
        for key in arguments.keys():
            try:
                results[key] = jobs[key].get()
            except Exception as e:
                if all_success:
                    raise e
                else:
                    logger.info("job failed")
                    logger.info(key)
                    logger.info(argument)
                    logger.info(args)
                    logger.info(kwargs)
        return results
Example #35
    def __init__(self,
                 directory,
                 classes=None,
                 number_subsequences=32,
                 dim=(32, 32, 32),
                 n_channels=6,
                 n_classes=10,
                 shuffle=True,
                 n_samples=None,
                 seed=None,
                 faster=True,
                 online_training=False,
                 repeat=True,
                 use_spacer=False,
                 randomrepeat=False,
                 sequence_length=50,
                 full_seq_embedding=False,
                 final_set=True,
                 include_raptorx_iupred=False,
                 include_dict_scores=False,
                 non_binary=False,
                 **kwargs):
        'Initialization'
        self.directory = directory
        self.classes = classes
        self.dim = dim
        self.labels = None
        self.list_IDs = None
        self.n_channels = n_channels
        self.shuffle = shuffle
        self.seed = seed
        self.online_training = online_training
        self.repeat = repeat
        self.use_spacer = use_spacer
        self.randomrepeat = randomrepeat
        self.maxLen = kwargs.get("maxLen", None)
        self.sequence_length = sequence_length
        self.full_seq_embedding = full_seq_embedding
        self.final_set = final_set
        self.include_raptorx_iupred = include_raptorx_iupred
        self.include_dict_scores = include_dict_scores
        self.non_binary = non_binary

        if full_seq_embedding:
            file_format = 'pkl'
        else:
            file_format = 'csv'

        if number_subsequences == 1:
            self.shrink_timesteps = False
        else:
            self.shrink_timesteps = True

        self.number_subsequences = number_subsequences

        if faster == True:
            self.faster = 16
        elif type(faster) == int and faster > 0:
            self.faster = faster
        else:
            self.faster = 1

        self.number_samples_per_batch = self.faster

        self.number_samples_per_class_to_pick = n_samples

        if not classes:
            classes = []
            for subdir in sorted(os.listdir(directory)):
                if os.path.isdir(os.path.join(directory, subdir)):
                    classes.append(subdir)
            self.classes = classes

        self.n_classes = len(classes)
        self.class_indices = dict(zip(classes, range(len(classes))))
        print(self.class_indices)
        # want a dict which contains dirs and number usable files
        pool = multiprocessing.pool.ThreadPool()
        function_partial = partial(_count_valid_files_in_directory,
                                   white_list_formats={file_format},
                                   follow_links=None,
                                   split=None)
        self.samples = pool.map(function_partial,
                                (os.path.join(directory, subdir)
                                 for subdir in classes))
        self.samples = dict(zip(classes, self.samples))

        results = []

        for dirpath in (os.path.join(directory, subdir) for subdir in classes):
            results.append(
                pool.apply_async(
                    utils._list_valid_filenames_in_directory,
                    (dirpath, {file_format}, None, self.class_indices, None)))

        self.filename_dict = {}
        for res in results:
            classes, filenames = res.get()
            for index, class_i in enumerate(classes):
                self.filename_dict.update(
                    {f"{class_i}_{index}": filenames[index]})

        pool.close()
        pool.join()

        if not n_samples:
            self.number_samples_per_class_to_pick = min(self.samples.values())

        self.elmo_embedder = Elmo_embedder()

        self.on_epoch_end()
    def __init__(self, directory, image_data_generator,
                 target_size=(256, 256), color_mode='rgb',
                 classes=None, class_mode='categorical',
                 batch_size=32, shuffle=True, seed=None,
                 data_format=None, vector_length=512, dimension=256,
                 save_to_dir=None, save_prefix='', save_format='png',
                 follow_links=False):
        if data_format is None:
            data_format = K.image_data_format()
        self.directory = directory
        self.image_data_generator = image_data_generator
        self.target_size = tuple(target_size)
        self.vector_length = vector_length
        if color_mode not in {'rgb', 'grayscale'}:
            raise ValueError('Invalid color mode:', color_mode,
                             '; expected "rgb" or "grayscale".')
        self.color_mode = color_mode
        self.data_format = data_format
        if self.color_mode == 'rgb':
            if self.data_format == 'channels_last':
                self.image_shape = self.target_size + (3,)
            else:
                self.image_shape = (3,) + self.target_size
        else:
            if self.data_format == 'channels_last':
                self.image_shape = self.target_size + (1,)
            else:
                self.image_shape = (1,) + self.target_size
        self.classes = classes
        self.dimension = dimension
        if class_mode not in {'categorical', 'binary', 'sparse',
                              'input', 'input_g_c', 'colorize', 'kl_divergence',
                              None}:
            raise ValueError('Invalid class_mode:', class_mode,
                             '; expected one of "categorical", '
                             '"binary", "sparse", "input"'
                             ' or None.')
        self.class_mode = class_mode
        self.save_to_dir = save_to_dir
        self.save_prefix = save_prefix
        self.save_format = save_format

        white_list_formats = {'png', 'jpg', 'jpeg', 'bmp'}

        # first, count the number of samples and classes
        self.samples = 0

        if not classes:
            classes = []
            for subdir in sorted(os.listdir(directory)):
                if os.path.isdir(os.path.join(directory, subdir)):
                    classes.append(subdir)
        self.num_class = len(classes)
        self.class_indices = dict(zip(classes, range(len(classes))))

        def _recursive_list(subpath):
            return sorted(os.walk(subpath, followlinks=follow_links), key=lambda tpl: tpl[0])

        pool = multiprocessing.pool.ThreadPool()
        function_partial = partial(_count_valid_files_in_directory,
                                   white_list_formats=white_list_formats,
                                   follow_links=follow_links)
        self.samples = sum(pool.map(function_partial,
                                    (os.path.join(directory, subdir)
                                     for subdir in classes)))

        print('Found %d images belonging to %d classes.' % (self.samples, self.num_class))

        # second, build an index of the images in the different class subfolders
        results = []

        self.filenames = []
        self.classes = np.zeros((self.samples,), dtype='int32')
        i = 0
        for dirpath in (os.path.join(directory, subdir) for subdir in classes):
            results.append(pool.apply_async(_list_valid_filenames_in_directory,
                                            (dirpath, white_list_formats,
                                             self.class_indices, follow_links)))
        for res in results:
            classes, filenames = res.get()
            self.classes[i:i + len(classes)] = classes
            self.filenames += filenames
            i += len(classes)
        pool.close()
        pool.join()
        super(DirectoryIterator, self).__init__(self.samples, batch_size, shuffle, seed)
def main():

    # SYSTEM VARIABLES
    Strategies_to_test = [100]
    strategy_runs = 100
    randomize = True
    verbose = True
    Market_history_file = './Data/Backtest_data/BacktestData_2020-21.xlsx'

    comission_pcnt = 0.02  # 2% Betfair commission
    initial_balance = 1100
    min_bet = 0  #Minimum wager (e.g. Betfair exchange)
    max_bet = 500  #Maximum bet the market can take? (much higher for Sportsbook)
    f = 0.1  #percentile of balance to bet
    Fixed_bet_amount = round(initial_balance *
                             0.025)  #for all fixed bet strategies
    saving = 0.0  #save percentagex100 of balance above initial balance

    # LOAD BACKTEST EXCEL DATA
    no_games, results, FULL_our_probs, FULL_our_prediction, FULL_market_odds,\
     FULL_market_prediction, CARMELO,  COVERS, ODDSHARK, H2H,  ODDSHARK_LastN_Away, ODDSHARK_LastN_Home = LoadBackTestData(Market_history_file)

    # STRATEGIES EVALUATION
    #variables for each strategy
    num_of_strategies = len(Strategies_to_test)
    StratStats = []
    running_stats = []
    for strats in range(num_of_strategies):
        StratStats.append([])
        running_stats.append([])

    #run strategies multiple times
    if randomize is False:
        strategy_runs = 1
    if strategy_runs > 1 or num_of_strategies > 1:
        verbose = False

    if verbose:

        plt.figure()
        ax = plt.axes()
        ax2 = ax.twinx(
        )  # instantiate a second axes that shares the same x-axis
        ax.set_xlabel('Bets')
        ax.set_ylabel('Balance', color='blue')
        ax.tick_params(axis='y', labelcolor='blue')

    else:
        ax = None
        ax2 = None

    #Single processing. Only for manual entries or for single runs
    if strategy_runs == 1:
        running_stats= StrategiesRun(Strategies_to_test,min_bet,max_bet,results,f,Fixed_bet_amount, \
                               FULL_our_probs, FULL_our_prediction, FULL_market_odds, FULL_market_prediction,  \
                               initial_balance, no_games, randomize,verbose,comission_pcnt,ax,ax2, saving)

    else:

        #Multiprocessing
        cpus = 12
        pool = multiprocessing.Pool(processes=cpus)
        results = [pool.apply_async(StrategiesRun, args=(Strategies_to_test,min_bet,max_bet,results,f,Fixed_bet_amount, \
                                FULL_our_probs, FULL_our_prediction, FULL_market_odds, FULL_market_prediction, \
                                initial_balance, no_games, randomize,verbose,comission_pcnt,ax,ax2, saving)) for i in range(strategy_runs)]
        pool.close()
        pool.join()

        #Gather the results
        for p in results:
            for strats in range(num_of_strategies):
                running_stats[strats].append(p.get()[strats][0])

    #average stats over runs
    for strats in range(num_of_strategies):
        StratStats[strats] = average_Strategy_Stats(running_stats[strats])

    #SAVE TO EXCEL
    workbook = openpyxl.Workbook()
    worksheet = workbook.worksheets[0]

    fields = dir(StratStats[0])
    for strats in range(num_of_strategies):
        field_count = 1

        worksheet.cell(0 + 1, strats + 1 + 1).value = str(
            StratStats[strats].StrategyName
        )  #Strategy name (Header). Cell indices start from 1

        for i in range(1, len(fields)):
            if "__" not in fields[i]:  #skip over internal fields of the struct

                worksheet.cell(field_count + 1, 0 + 1).value = str(
                    fields[i])  #Field name (Header). Cell indices start from 1
                exec(
                    "worksheet.cell(field_count+1, strats+1+1).value=     StratStats[strats]."
                    + fields[i])  #Field data. Cell indices start from 1
                field_count = field_count + 1

    workbook.save("./Data/Backtest_data/Backtest_simulations.xlsx")
Example #38
 def wrapper(*args, **kw):
     pool = multiprocessing.pool.ThreadPool(processes=1)
     async_result = pool.apply_async(func, args, kw)
     return async_result.get(seconds)
    def __init__(self,
                 directory,
                 sound_data_generator,
                 target_size=(256, 256),
                 classes=None,
                 class_mode='categorical',
                 batch_size=32,
                 shuffle=True,
                 seed=None,
                 follow_links=False,
                 subset=None,
                 interpolation='nearest',
                 dtype='float32'):

        super(DirectoryIterator,
              self).set_processing_attrs(sound_data_generator, target_size,
                                         subset)

        self.directory = directory
        self.classes = classes
        if class_mode not in self.allowed_class_modes:
            raise ValueError(
                'Invalid class_mode: {}; expected one of: {}'.format(
                    class_mode, self.allowed_class_modes))
        self.class_mode = class_mode
        self.dtype = dtype

        self.samples = 0

        if not classes:
            classes = []
            for subdir in sorted(os.listdir(directory)):
                if os.path.isdir(os.path.join(directory, subdir)):
                    classes.append(subdir)
        self.num_classes = len(classes)
        self.class_indices = dict(zip(classes, range(len(classes))))

        pool = multiprocessing.pool.ThreadPool()

        results = []
        self.filenames = []
        i = 0
        for dirpath in (os.path.join(directory, subdir) for subdir in classes):
            results.append(
                pool.apply_async(_list_valid_filenames_in_directory,
                                 (dirpath, self.white_list_formats, self.split,
                                  self.class_indices, follow_links)))
        classes_list = []
        for res in results:
            classes, filenames = res.get()
            classes_list.append(classes)
            self.filenames += filenames
        self.samples = len(self.filenames)
        self.classes = np.zeros((self.samples, ), dtype='int32')
        for classes in classes_list:
            self.classes[i:i + len(classes)] = classes
            i += len(classes)

        print('Found %d wav files belonging to %d classes.' %
              (self.samples, self.num_classes))
        pool.close()
        pool.join()
        self._filepaths = [
            os.path.join(self.directory, fname) for fname in self.filenames
        ]
        super(DirectoryIterator, self).__init__(self.samples, batch_size,
                                                shuffle, seed)
Example #40
def buildDecisionTree(df,
                      root,
                      file,
                      config,
                      dataset_features,
                      parent_level=0,
                      leaf_id=0,
                      parents='root',
                      tree_id=0,
                      validation_df=None,
                      main_process_id=None):

    models = []

    decision_rules = []

    feature_names = df.columns[0:-1]

    enableParallelism = config['enableParallelism']
    algorithm = config['algorithm']

    json_file = file.split(".")[0] + ".json"

    random_forest_enabled = config['enableRandomForest']
    enableGBM = config['enableGBM']
    enableAdaboost = config['enableAdaboost']

    if root == 1:
        if random_forest_enabled != True and enableGBM != True and enableAdaboost != True:
            raw_df = df.copy()

    #--------------------------------------

    df_copy = df.copy()

    winner_name, num_of_instances, metric, metric_name = findDecision(
        df, config)

    #find the winner's index; findDecision cannot return it because columns were dropped in previous steps
    j = 0
    for i in dataset_features:
        if i == winner_name:
            winner_index = j
        j = j + 1

    numericColumn = False
    if dataset_features[winner_name] != 'object':
        numericColumn = True

    #restoration
    columns = df.shape[1]
    for i in range(0, columns - 1):
        #column_name = df.columns[i]; column_type = df[column_name].dtypes #numeric field already transformed to object. you cannot check it with df itself, you should check df_copy
        column_name = df_copy.columns[i]
        column_type = df_copy[column_name].dtypes
        if column_type != 'object' and column_name != winner_name:
            df[column_name] = df_copy[column_name]

    classes = df[winner_name].value_counts().keys().tolist()
    #print("classes: ",classes," in ", winner_name)
    #-----------------------------------------------------

    num_cores = config["num_cores"]

    input_params = []

    #serial approach
    for i in range(0, len(classes)):
        current_class = classes[i]
        subdataset = df[df[winner_name] == current_class]
        subdataset = subdataset.drop(columns=[winner_name])
        branch_index = i * 1

        #create branches serially
        if enableParallelism != True:

            if i == 0:

                descriptor = {
                    "feature": winner_name,
                    "instances": num_of_instances,
                    #"metric_name": metric_name,
                    "metric_value": round(metric, 4),
                    "depth": parent_level + 1
                }
                descriptor = "# " + json.dumps(descriptor)

                functions.storeRule(
                    file, (functions.formatRule(root), "", descriptor))

            results = createBranch(config,
                                   current_class,
                                   subdataset,
                                   numericColumn,
                                   branch_index,
                                   winner_name,
                                   winner_index,
                                   root,
                                   parents,
                                   file,
                                   dataset_features,
                                   num_of_instances,
                                   metric,
                                   tree_id=tree_id,
                                   main_process_id=main_process_id)

            decision_rules = decision_rules + results

        else:
            input_params.append(
                (config, current_class, subdataset, numericColumn,
                 branch_index, winner_name, winner_index, root, parents, file,
                 dataset_features, num_of_instances, metric, tree_id,
                 main_process_id))

    #---------------------------
    #add else condition in the decision tree

    if df.Decision.dtypes == 'object':  #classification
        pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index()
        pivot = pivot.rename(columns={
            "Decision": "Instances",
            "index": "Decision"
        })
        pivot = pivot.sort_values(by=["Instances"],
                                  ascending=False).reset_index()

        else_decision = "return '%s'" % (pivot.iloc[0].Decision)

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(
                file, (functions.formatRule(root + 1), else_decision))
        else:  #parallelism
            leaf_id = str(uuid.uuid1())

            check_rule = "else: " + else_decision

            sample_rule = {}
            sample_rule["current_level"] = root
            sample_rule["leaf_id"] = leaf_id
            sample_rule["parents"] = parents
            sample_rule["rule"] = check_rule
            sample_rule["feature_idx"] = -1
            sample_rule["feature_name"] = ""
            sample_rule["instances"] = df.shape[0]
            sample_rule["metric"] = 0
            sample_rule["return_statement"] = 0
            sample_rule["tree_id"] = tree_id

            #json to string
            sample_rule = json.dumps(sample_rule)
            decision_rules.append(sample_rule)

    else:  #regression
        else_decision = "return %s" % (subdataset.Decision.mean())

        if enableParallelism != True:
            functions.storeRule(file, (functions.formatRule(root), "else:"))
            functions.storeRule(
                file, (functions.formatRule(root + 1), else_decision))
        else:
            leaf_id = str(uuid.uuid1())

            check_rule = "else: " + else_decision

            sample_rule = {}
            sample_rule["current_level"] = root
            sample_rule["leaf_id"] = leaf_id
            sample_rule["parents"] = parents
            sample_rule["rule"] = check_rule
            sample_rule["tree_id"] = tree_id
            sample_rule["feature_name"] = ""
            sample_rule["instances"] = 0
            sample_rule["metric"] = 0
            sample_rule["return_statement"] = 1

            #json to string
            sample_rule = json.dumps(sample_rule)
            decision_rules.append(sample_rule)

    #---------------------------

    try:
        main_process = psutil.Process(main_process_id)
        children = main_process.children(recursive=True)
        active_processes = len(children) + 1  #plus parent
        #active_processes = len(children)
    except:
        active_processes = 100  #set a large initial value

    results = []
    #create branches in parallel
    if enableParallelism == True:

        required_threads = active_processes + len(classes)

        #if parent_level == 0 and random_forest_enabled != True:
        if main_process_id != None and num_cores >= required_threads:  #len(classes) branches will be run in parallel

            #POOL_SIZE = num_cores
            POOL_SIZE = len(classes)

            #with closing(multiprocessing.Pool(POOL_SIZE)) as pool:
            with closing(MyPool(POOL_SIZE)) as pool:
                funclist = []

                for input_param in input_params:
                    f = pool.apply_async(createBranchWrapper,
                                         [createBranch, input_param])
                    funclist.append(f)

                #all functions registered here

                for f in funclist:
                    branch_results = f.get(timeout=100000)

                    for branch_result in branch_results:
                        results.append(branch_result)

                pool.close()
                pool.terminate()

            #--------------------------------

        else:  #serial
            for input_param in input_params:
                sub_results = createBranchWrapper(createBranch, input_param)
                for sub_result in sub_results:
                    results.append(sub_result)

        #--------------------------------

        decision_rules = decision_rules + results

        #--------------------------------

        if root != 1:  #return children results until the root node
            return decision_rules

    #---------------------------------------------

    if root == 1:

        if enableParallelism == True:

            #custom rules are stored in decision_rules. merge them all in a json file first

            json_rules = "[\n"  #initialize

            file_index = 0
            for custom_rule in decision_rules:

                json_rules += custom_rule

                if file_index < len(decision_rules) - 1:
                    json_rules += ", "

                json_rules += "\n"

                file_index = file_index + 1

            #-----------------------------------

            json_rules += "]"
            functions.createFile(json_file, json_rules)

            #-----------------------------------
            #reconstruct rules from json to py

            reconstructRules(json_file, feature_names)

            #-----------------------------------

        #is regular decision tree
        if config['enableRandomForest'] != True and config[
                'enableGBM'] != True and config['enableAdaboost'] != True:
            #this is a regular decision tree. find accuracy here.

            moduleName = "outputs/rules/rules"
            fp, pathname, description = imp.find_module(moduleName)
            myrules = imp.load_module(moduleName, fp, pathname,
                                      description)  #rules0
            models.append(myrules)

    return models
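The branch-building code above submits work through MyPool inside closing(); MyPool is not defined in this excerpt, but such helpers are typically non-daemonic pools so that workers can spawn children of their own (needed when branches are built recursively or when random forests nest pools). A minimal sketch of what MyPool might look like, assuming Python 3.8+:

import multiprocessing
import multiprocessing.pool


class NoDaemonProcess(multiprocessing.Process):
    # Workers of this pool are never marked daemonic, so they may create
    # child processes of their own.
    @property
    def daemon(self):
        return False

    @daemon.setter
    def daemon(self, value):
        pass


class NoDaemonContext(type(multiprocessing.get_context())):
    Process = NoDaemonProcess


class MyPool(multiprocessing.pool.Pool):
    # Hypothetical stand-in for the MyPool used above; the real definition
    # is not part of this excerpt.
    def __init__(self, *args, **kwargs):
        kwargs['context'] = NoDaemonContext()
        super().__init__(*args, **kwargs)

With a stock multiprocessing.Pool, the nested pools used for sub-branches would fail with "daemonic processes are not allowed to have children".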
Exemple #41
0
    def __init__(self, directory, image_data_generator,
                 target_size=(256, 256), batch_size=32,
                 shuffle=True, seed=None,
                 data_format=None,
                 save_to_dir=None, save_prefix='', save_format='png',
                 follow_links=False):
        if data_format is None:
            data_format = K.image_data_format()
        self.directory = directory
        self.image_data_generator = image_data_generator
        self.target_size = tuple(target_size)

        # density maps are always grayscale
        self.color_mode = 'grayscale'
        self.data_format = data_format
        if self.data_format == 'channels_last':
            self.image_shape = self.target_size + (1,)
        else:
            self.image_shape = (1,) + self.target_size

        # class mode is always None
        self.class_mode = None

        self.save_to_dir = save_to_dir
        self.save_prefix = save_prefix
        self.save_format = save_format

        # density maps are stored as npy files
        white_list_formats = {'npy'}

        # first, count the number of samples and classes
        self.samples = 0
        classes = []
        for subdir in sorted(os.listdir(directory)):
            if os.path.isdir(os.path.join(directory, subdir)):
                classes.append(subdir)
        self.num_class = len(classes)
        self.class_indices = dict(zip(classes, range(len(classes))))

        def _recursive_list(subpath):
            return sorted(os.walk(subpath, followlinks=follow_links), key=lambda tpl: tpl[0])

        pool = multiprocessing.pool.ThreadPool()
        function_partial = partial(_count_valid_files_in_directory,
                                   white_list_formats=white_list_formats,
                                   follow_links=follow_links)
        self.samples = sum(pool.map(function_partial,
                                    (os.path.join(directory, subdir)
                                     for subdir in classes)))

        print('Found %d density maps.' % self.samples)

        # second, build an index of the images in the different class subfolders
        results = []

        self.filenames = []
        self.classes = np.zeros((self.samples,), dtype='int32')
        i = 0
        for dirpath in (os.path.join(directory, subdir) for subdir in classes):
            results.append(pool.apply_async(_list_valid_filenames_in_directory,
                                            (dirpath, white_list_formats,
                                             self.class_indices, follow_links)))
        for res in results:
            classes, filenames = res.get()
            self.classes[i:i + len(classes)] = classes
            self.filenames += filenames
            i += len(classes)
        pool.close()
        pool.join()
        super(NpyDirectoryIterator, self).__init__(self.samples, batch_size, shuffle, seed)
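The constructor above first counts valid files per class folder with ThreadPool.map, then lists the filenames with apply_async. A self-contained sketch of the counting half, using a generic helper instead of Keras' private _count_valid_files_in_directory (the root path and class names are placeholders):

import os
from functools import partial
from multiprocessing.pool import ThreadPool


def count_files_with_ext(directory, white_list_formats, follow_links=False):
    # Count files under `directory` whose extension is in `white_list_formats`.
    suffixes = tuple('.' + ext for ext in white_list_formats)
    total = 0
    for _root, _dirs, files in os.walk(directory, followlinks=follow_links):
        total += sum(1 for f in files if f.lower().endswith(suffixes))
    return total


if __name__ == '__main__':
    base = '/data/density_maps'        # placeholder root directory
    classes = ['train', 'val']         # placeholder class subfolders
    pool = ThreadPool()
    fn = partial(count_files_with_ext, white_list_formats={'npy'})
    counts = pool.map(fn, (os.path.join(base, c) for c in classes))
    pool.close()
    pool.join()
    print(dict(zip(classes, counts)))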
Exemple #42
0
    designspaceItalic = fontTools.designspaceLib.DesignSpaceDocument.fromfile(
        INPUT_DIR / "CascadiaCode_variable_italic.designspace")

    designspaceItalic.instances = [
        s for s in designspaceItalic.instances
        if s.lib.get("com.schriftgestaltung.export", True)
    ]

    # Stage 1: Make all the things.
    pool = multiprocessing.pool.Pool(processes=multiprocessing.cpu_count())
    processes = []
    processes.append(
        pool.apply_async(
            build_font_variable,
            (
                designspace,
                "Cascadia Code",
                args.vtt_compile,
            ),
        ))
    if args.italic:
        processes.append(
            pool.apply_async(
                build_font_variable,
                (
                    designspaceItalic,
                    "Cascadia Code Italic",
                    args.vtt_compile,
                ),
            ))
    if args.mono:
        processes.append(
Exemple #43
0
                'batch_%05d_%05d_%d' %
                (frameNumbers[0], frameNumbers[-1], options.bundleLength))
            thisOutputFolder = os.path.join(outputFolder, batchFolderName)

            if not options.logBatches:
                logger.info('Running processing batch in output folder: ' +
                            thisOutputFolder + '\n' + 'with options: ' +
                            extraOptions + ' --stereo-arguments ' +
                            options.stereoArgs)

            if not options.dryRun:
                # Generate the command call
                taskHandles.append(
                    pool.apply_async(
                        processBatch,
                        (batchImageCameraPairs, lidarFolder,
                         options.referenceDem, thisSkipInterval,
                         thisOutputFolder, extraOptions, outputResolution,
                         options.stereoArgs, batchNum, batchLogPath)))
            batchNum += 1

        # Reset these lists
        batchImageCameraPairs = []
        frameNumbers = []

        # Advance to the frame that starts the next batch
        if hitBreakFrame:
            # When we hit a break in the frames we need to start the
            # next batch after the break frame
            i = j + 1
        else:
            # Start in the next frame that was not used as a "left" stereo image.
Exemple #44
0
    def _generate_descriptor_matrices(self, data_set, **kwargs):
        """
        Generate info and descriptor matrices based on ingest type.

        :param data_set: Iterable of data elements to generate combined info
            and descriptor matrices for.
        :type item_iter: collections.Set[smqtk.representation.DataElement]

        :param limit: Limit the number of descriptor entries to this amount.
        :type limit: int

        :return: Combined info and descriptor matrices for all base images
        :rtype: (numpy.core.multiarray.ndarray, numpy.core.multiarray.ndarray)

        """
        if not data_set:
            raise ValueError("No data given to process.")

        inf = float('inf')
        descriptor_limit = kwargs.get('limit', inf)
        per_item_limit = numpy.floor(float(descriptor_limit) / len(data_set))

        if len(data_set) == 1:
            # because an iterable doesn't necessarily have a next() method
            di = iter(data_set).next()
            # Check for checkpoint files
            info_fp, desc_fp = \
                self._get_standard_info_descriptors_filepath(di)
            # Save out data bytes to temporary file
            temp_img_filepath = self._get_data_temp_path(di)
            try:
                # Generate descriptors
                utils.generate_descriptors(self.EXE, temp_img_filepath,
                                           self.descriptor_type(), info_fp,
                                           desc_fp, per_item_limit)
            finally:
                # clean temp file
                di.clean_temp()
            return numpy.load(info_fp), numpy.load(desc_fp)
        else:
            # compute and V-stack matrices for all given images
            pool = multiprocessing.Pool(processes=self.parallel)

            # Mapping of UID to tuple containing:
            #   (info_fp, desc_fp, async processing result, tmp_clean_method)
            r_map = {}
            with SimpleTimer("Computing descriptors async...",
                             self._log.debug):
                for di in data_set:
                    # Creating temporary image file from data bytes
                    tmp_img_fp = self._get_data_temp_path(di)

                    info_fp, desc_fp = \
                        self._get_standard_info_descriptors_filepath(di)
                    args = (self.EXE, tmp_img_fp, self.descriptor_type(),
                            info_fp, desc_fp)
                    r = pool.apply_async(utils.generate_descriptors, args)
                    r_map[di.uuid()] = (info_fp, desc_fp, r, di.clean_temp)
            pool.close()

            # Pass through results from descriptor generation, aggregating
            # matrix shapes.
            # - Transforms r_map into:
            #       UID -> (info_fp, desc_fp, starting_row, SubSampleIndices)
            self._log.debug("Constructing information for super matrices...")
            s_keys = sorted(r_map.keys())
            running_height = 0  # info and desc heights congruent

            i_width = None
            d_width = None

            for uid in s_keys:
                ifp, dfp, r, tmp_clean_method = r_map[uid]

                # descriptor generation may have failed for this ingest UID
                try:
                    i_shape, d_shape = r.get()
                except RuntimeError, ex:
                    self._log.warning(
                        "Descriptor generation failed for "
                        "UID[%s], skipping its inclusion in "
                        "model: %s", uid, str(ex))
                    r_map[uid] = None
                    continue
                finally:
                    # Done with image file, so remove from filesystem
                    tmp_clean_method()
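The excerpt above keeps each AsyncResult in r_map and later calls .get(), skipping items whose descriptor generation raised. A minimal Python 3 sketch of that submit-then-collect idiom, with a hypothetical worker in place of utils.generate_descriptors:

import multiprocessing


def work(item):
    # Hypothetical worker; raises for items it cannot handle.
    if item % 5 == 0:
        raise RuntimeError("cannot process %d" % item)
    return item * item


if __name__ == '__main__':
    pool = multiprocessing.Pool(processes=4)
    r_map = {item: pool.apply_async(work, (item,)) for item in range(1, 12)}
    pool.close()

    results = {}
    for item, r in r_map.items():
        try:
            results[item] = r.get()
        except RuntimeError as ex:
            # Warn and continue, mirroring the behaviour of the snippet above.
            print("item %d failed: %s" % (item, ex))
    pool.join()
    print(results)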
Exemple #45
0
    for div in divs:
        post = get_post(div)
        if post:
            queue.put(post)


def get_queue_contents(queue):
    def get_queue():
        try:
            return queue.get_nowait()
        except:
            return None

    return [x for x in iter(get_queue, None)]


if __name__ == '__main__':
    urls = ['http://www.holycool.net/page/%d' % x for x in range(1, 128)]
    queue = Queue.Queue()
    lock = multiprocessing.Lock()
    pool = multiprocessing.pool.ThreadPool(60)

    for url in urls:
        pool.apply_async(fetch_posts, args=(url, queue, lock))
    pool.close()
    pool.join()

    posts = get_queue_contents(queue)
    with open('scraped.txt', 'w') as f:
        f.write(str(posts))
Exemple #46
0
    def _generate_descriptor_matrices(self, data_set, **kwargs):
        """
        Generate info and descriptor matrices based on ingest type.

        :param data_set: Iterable of data elements to generate combined info
            and descriptor matrices for.
        :type item_iter: collections.Set[smqtk.representation.DataElement]

        :param limit: Limit the number of descriptor entries to this amount.
        :type limit: int

        :return: Combined info and descriptor matrices for all base images
        :rtype: (numpy.core.multiarray.ndarray, numpy.core.multiarray.ndarray)

        """
        descriptor_limit = kwargs.get('limit', float('inf'))
        # With videos, an "item" is one video, so collect for a whole video
        # as normal, then subsample from the full video collection.
        per_item_limit = numpy.floor(float(descriptor_limit) / len(data_set))

        # If an odd number of jobs, favor descriptor extraction
        if self.parallel:
            descr_parallel = int(max(1, math.ceil(self.parallel / 2.0)))
            extract_parallel = int(max(1, math.floor(self.parallel / 2.0)))
        else:
            cpuc = multiprocessing.cpu_count()
            descr_parallel = int(max(1, math.ceil(cpuc / 2.0)))
            extract_parallel = int(max(1, math.floor(cpuc / 2.0)))

        # For each video, extract frames and submit colorDescriptor processing
        # jobs for each frame, combining all results into a single matrix for
        # return.
        pool = multiprocessing.Pool(processes=descr_parallel)

        # Mapping of [UID] to [frame] to tuple containing:
        #   (info_fp, desc_fp, async processing result)
        r_map = {}
        with SimpleTimer("Extracting frames and submitting descriptor jobs...",
                         self._log.debug):
            for di in data_set:
                r_map[di.uuid()] = {}
                tmp_vid_fp = self._get_data_temp_path(di)
                p = dict(self.FRAME_EXTRACTION_PARAMS)
                vmd = get_metadata_info(tmp_vid_fp)
                p['second_offset'] = vmd.duration * p['second_offset']
                p['max_duration'] = vmd.duration * p['max_duration']
                fm = video_utils.ffmpeg_extract_frame_map(
                    self._work_dir, tmp_vid_fp, parallel=extract_parallel, **p)

                # Compute descriptors for extracted frames.
                for frame, imgPath in fm.iteritems():
                    info_fp, desc_fp = \
                        self._get_standard_info_descriptors_filepath(di, frame)
                    r = pool.apply_async(utils.generate_descriptors,
                                         args=(self.EXE, imgPath,
                                               self.descriptor_type(), info_fp,
                                               desc_fp))
                    r_map[di.uuid()][frame] = (info_fp, desc_fp, r)

                # Clean temporary video file while computing descriptors
                # This does not remove the extracted frames that the underlying
                #   detector/descriptor is working on.
                di.clean_temp()
        pool.close()

        # Each result is a tuple of two ndarrays: info and descriptor matrices
        with SimpleTimer("Collecting shape information for super matrices...",
                         self._log.debug):
            running_height = 0

            i_width = None
            d_width = None

            # Transform r_map[uid] into:
            #   (info_mat_files, desc_mat_files, sR, ssi_list)
            #   -> files in frame order
            uids = sorted(r_map)
            for uid in uids:
                video_num_desc = 0
                video_info_mat_fps = []  # ordered list of frame info mat files
                video_desc_mat_fps = []  # ordered list of frame desc mat files
                for frame in sorted(r_map[uid]):
                    ifp, dfp, r = r_map[uid][frame]

                    # Descriptor generation may have failed for this UID
                    try:
                        i_shape, d_shape = r.get()
                    except RuntimeError, ex:
                        self._log.warning(
                            'Descriptor generation failed for '
                            'frame %d in video UID[%s]: %s', frame, uid,
                            str(ex))
                        r_map[uid] = None
                        continue

                    if d_width is None and d_shape[0] != 0:
                        i_width = i_shape[1]
                        d_width = d_shape[1]

                    # Skip if there were no descriptors generated for this
                    # frame
                    if d_shape[1] == 0:
                        continue

                    video_info_mat_fps.append(ifp)
                    video_desc_mat_fps.append(dfp)
                    video_num_desc += d_shape[0]

                # If combined descriptor height exceeds the per-item limit,
                # generate a random subsample index list
                ssi = None
                if video_num_desc > per_item_limit:
                    ssi = sorted(
                        numpy.random.permutation(video_num_desc)
                        [:per_item_limit])
                    video_num_desc = len(ssi)

                r_map[uid] = (video_info_mat_fps, video_desc_mat_fps,
                              running_height, ssi)
                running_height += video_num_desc
Exemple #47
0
    def __init__(self,
                 directory,
                 classes=None,
                 number_subsequences=32,
                 dim=(32, 32, 32),
                 n_channels=6,
                 n_classes=10,
                 shuffle=True,
                 n_samples=None,
                 seed=None,
                 faster=True,
                 online_training=False):
        'Initialization'
        self.directory = directory
        self.classes = classes
        self.dim = dim
        self.labels = None
        self.list_IDs = None
        self.n_channels = n_channels
        self.shuffle = shuffle
        self.seed = seed
        self.online_training = online_training
        if number_subsequences == 1:
            self.shrink_timesteps = False
        else:
            self.shrink_timesteps = True

        self.number_subsequences = number_subsequences

        if faster == True:
            self.faster = 16
        elif type(faster) == int and faster > 0:
            self.faster = faster
        else:
            self.faster = 1

        self.number_samples_per_batch = self.faster

        self.number_samples_per_class_to_pick = n_samples

        if not classes:
            classes = []
            for subdir in sorted(os.listdir(directory)):
                if os.path.isdir(os.path.join(directory, subdir)):
                    classes.append(subdir)
            self.classes = classes

        self.n_classes = len(classes)
        self.class_indices = dict(zip(classes, range(len(classes))))

        # want a dict which contains dirs and number usable files
        pool = multiprocessing.pool.ThreadPool()
        function_partial = partial(image._count_valid_files_in_directory,
                                   white_list_formats={'csv'},
                                   follow_links=None,
                                   split=None)
        self.samples = pool.map(function_partial,
                                (os.path.join(directory, subdir)
                                 for subdir in classes))
        self.samples = dict(zip(classes, self.samples))

        results = []

        for dirpath in (os.path.join(directory, subdir) for subdir in classes):
            results.append(
                pool.apply_async(
                    image._list_valid_filenames_in_directory,
                    (dirpath, {'csv'}, None, self.class_indices, None)))

        self.filename_dict = {}
        for res in results:
            classes, filenames = res.get()
            for index, class_i in enumerate(classes):
                self.filename_dict.update(
                    {f"{class_i}_{index}": filenames[index]})

        pool.close()
        pool.join()

        if not n_samples:
            self.number_samples_per_class_to_pick = min(self.samples.values())

        self.on_epoch_end()
Exemple #48
0
def run_calls(fun,
              list_of_args,
              extra_args=(),
              pool_type='processes',
              nb_workers=multiprocessing.cpu_count(),
              timeout=60,
              verbose=True,
              initializer=None,
              initargs=None):
    """
    Run a function several times in parallel with different inputs.

    Args:
        fun: function to be called several times in parallel.
        list_of_args: list of (first positional) arguments passed to fun, one
            per call
        extra_args: tuple containing extra arguments to be passed to fun
            (same value for all calls)
        pool_type: either 'processes' or 'threads'
        nb_workers: number of calls run simultaneously
        timeout: number of seconds allowed per function call
        verbose: either True (show the amount of computed calls) or False
        initializer, initargs (optional): if initializer is not None then each
            worker process will call initializer(*initargs) when it starts

    Return:
        list of outputs
    """
    if pool_type == 'processes':
        pool = multiprocessing.Pool(nb_workers, initializer, initargs)
    elif pool_type == 'threads':
        pool = multiprocessing.pool.ThreadPool(nb_workers)
    else:
        raise ValueError("unknow pool_type {}".format(pool_type))

    results = []
    outputs = []

    with contextlib.ExitStack() as stack:
        if verbose:
            bar = stack.enter_context(tqdm(total=len(list_of_args)))

        for x in list_of_args:
            if type(x) == tuple:
                args = x + extra_args
            else:
                args = (x, ) + extra_args
            results.append(
                pool.apply_async(fun,
                                 args=args,
                                 callback=lambda x: bar.update(1)
                                 if verbose else None))

        for r in results:
            try:
                outputs.append(r.get(timeout))
            except KeyboardInterrupt:
                pool.terminate()
                sys.exit(1)

    pool.close()
    pool.join()
    return outputs
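A short usage sketch for run_calls as documented above; the worker and inputs are illustrative, not taken from the snippet:

def add(x, y):
    # Illustrative worker: x comes from list_of_args, y from extra_args.
    return x + y


if __name__ == '__main__':
    # Threads avoid the pickling requirements of a process pool and are
    # enough for this toy example.
    outputs = run_calls(add,
                        list_of_args=[1, 2, 3, 4],
                        extra_args=(10,),
                        pool_type='threads',
                        nb_workers=2,
                        timeout=5,
                        verbose=False)
    print(outputs)  # [11, 12, 13, 14]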
Exemple #49
0
def runFBA_multi(inputTar,
                 gem_sbml,
                 outputTar,
                 sim_type,
                 source_reaction,
                 target_reaction,
                 source_coefficient,
                 target_coefficient,
                 is_max,
                 fraction_of,
                 dont_merge=True,
                 num_workers=10,
                 pathway_id='rp_pathway',
                 objective_id=None,
                 compartment_id='MNXC3',
                 fill_orphan_species=False,
                 species_group_id='central_species',
                 sink_species_group_id='rp_sink_species'):
    """Subprocess implementation of rpFBA

    :param inputTar: Path of the TAR rpSBML files
    :param gem_sbml: Path to the GEM file
    :param outputTar: Path of the TAR output
    :param sim_type: The type of simulation to use. Available simulation types include: fraction, fba, rpfba
    :param source_reaction: The reaction id of the source reaction.
    :param target_reaction: The reaction id of the target reaction. Note that if fba or rpfba options are used, then these are ignored
    :param source_coefficient: The source coefficient
    :param target_coefficient: The target coefficient
    :param is_max: Maximise or minimise the objective
    :param fraction_of: The fraction of the optimum. Note that this value is ignored if fba is used
    :param dont_merge: Output the merged model (Default: True)
    :param num_workers: The number of processes to use (Default: 10)
    :param pathway_id: The id of the heterologous pathway (Default: rp_pathway)
    :param objective_id: Overwrite the auto-generated id of the results (Default: None)
    :param compartment_id: The SBML compartment id (Default: MNXC3)
    :param fill_orphan_species: Add pseudo reactions that consume/produce single parent species. Note in development
    :param species_group_id: The id of the central species (Default: central_species)
    :param sink_species_group_id: The id of the sink species (Default: rp_sink_species)

    :type inputTar: str 
    :type gem_sbml: str
    :type outputTar: str 
    :type sim_type: str
    :type source_reaction: str
    :type target_reaction: str
    :type source_coefficient: float
    :type target_coefficient: float
    :type is_max: bool
    :type fraction_of: float
    :type dont_merge: bool
    :type num_workers: int
    :type pathway_id: str
    :type objective_id: str
    :type compartment_id: str
    :type fill_orphan_species: bool
    :type species_group_id: str
    :type sink_species_group_id: str

    :return: Success or failure of the function
    :rtype: bool
    """
    with tempfile.TemporaryDirectory() as tmpOutputFolder:
        with tempfile.TemporaryDirectory() as tmpInputFolder:
            tar = tarfile.open(inputTar, mode='r')
            tar.extractall(path=tmpInputFolder)
            tar.close()
            if len(glob.glob(tmpInputFolder + '/*')) == 0:
                logging.error('Input file is empty')
                return False
            #HERE SPECIFY THE NUMBER OF CORES
            pool = nonDeamonicPool(processes=num_workers)
            results = []
            for sbml_path in glob.glob(tmpInputFolder + '/*'):
                file_name = sbml_path.split('/')[-1].replace(
                    '.sbml', '').replace('.xml', '').replace('.rpsbml', '')
                results.append(
                    pool.apply_async(singleFBA_hdd,
                                     args=(
                                         file_name,
                                         sbml_path,
                                         gem_sbml,
                                         sim_type,
                                         source_reaction,
                                         target_reaction,
                                         source_coefficient,
                                         target_coefficient,
                                         is_max,
                                         fraction_of,
                                         tmpOutputFolder,
                                         dont_merge,
                                         pathway_id,
                                         objective_id,
                                         compartment_id,
                                         fill_orphan_species,
                                         species_group_id,
                                         sink_species_group_id,
                                     )))
            output = [p.get() for p in results]
            pool.close()
            pool.join()
            if len(glob.glob(tmpOutputFolder + '/*')) == 0:
                logging.error('rpFBA has not produced any results')
                return False
            with tarfile.open(outputTar, mode='w:gz') as ot:
                for sbml_path in glob.glob(tmpOutputFolder + '/*'):
                    file_name = str(
                        sbml_path.split('/')[-1].replace('.sbml', '').replace(
                            '.xml', '').replace('.rpsbml', '')) + '.sbml.xml'
                    info = tarfile.TarInfo(file_name)
                    info.size = os.path.getsize(sbml_path)
                    ot.addfile(tarinfo=info, fileobj=open(sbml_path, 'rb'))
    return True
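An illustrative call of runFBA_multi; every path and reaction id below is a placeholder, and the remaining arguments keep the defaults described in the docstring:

success = runFBA_multi(inputTar='/tmp/rp_pathways.tar',        # placeholder path
                       gem_sbml='/tmp/model.sbml',             # placeholder path
                       outputTar='/tmp/rpfba_results.tar.gz',  # placeholder path
                       sim_type='fraction',
                       source_reaction='biomass',              # placeholder reaction id
                       target_reaction='rp_target',            # placeholder reaction id
                       source_coefficient=1.0,
                       target_coefficient=1.0,
                       is_max=True,
                       fraction_of=0.75,
                       num_workers=4)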
Exemple #50
0
def generate(
    distros=None,
    version=None,
    jobs=None,
    publish_under=None,
    generate_docker_tarball=False,
    generate_distro_specific_sct_tarball=False,
    build_options=[],
    proxy=False,
):
    """
	"""

    if distros is None:
        distros = default_distros

    if version is None:
        version = default_version

    logger.info("Generating distro Dockerfiles")
    names = []
    for distro in distros:
        name = "sct-{}-{}".format(version, distro.replace(":", "-")).lower()
        logger.info("- %s...", name)

        lock = threading.Lock()  # prevent building official simultaneously with its alias

        name = sct_docker.generate(
            distro=distro,
            version=version,
            name=name,
            commands=default_commands,
            install_fsleyes=True,
            install_tools=True,
            install_python=True,
            #install_fsl=True,
            configure_ssh=True,
            verbose=False,
            proxy=proxy,
        )

        names.append((name, lock))

        if distro == "official":
            name = "sct-{}-{}".format(version, "official").lower()
            logger.info("- %s...", name)

            name = sct_docker.generate(
                distro=official_distro,
                version=version,
                name=name,
                commands=default_commands,
                install_fsleyes=True,
                #install_fsl=True,
                configure_ssh=True,
                verbose=False,
                proxy=proxy,
            )

            names.append((name, lock))

    logger.info("Done generating distro Dockerfiles")

    logger.info("Building images")

    if not check_exe("docker"):
        raise RuntimeError(
            "You might want to have docker available when running this tool")

    pool = multiprocessing.pool.ThreadPool(jobs)

    try:
        res = list()
        for name, lock in names:

            cmd = [
                "docker",
                "build",
                "-t",
                name,
                name,
            ] + build_options

            def docker_build(cmd, lock):
                with lock:
                    return subprocess.call(cmd)

            promise = pool.apply_async(docker_build, (cmd, lock))
            res.append(promise)

        errs = list()
        for (name, _), promise in zip(names, res):
            err = promise.get()
            if err != 0:
                logger.error("{} failed with error code {}".format(name, err))
            errs.append(err)

        pool.close()
    finally:
        pool.terminate()
    pool.join()
    logger.info("Done building images")

    failed = False
    for (name, _), err in zip(names, errs):
        if err == 0:
            logger.info("{} finished successfully".format(name))
        else:
            logger.error("{} failed with error code {}".format(name, err))
            failed = True

    if failed:
        logger.error("Not proceeding further as one distro failed: %s", errs)
        raise RuntimeError("Failed generating one distro")

    if proxy:
        return

    if publish_under:
        logger.info("Publishing on Docker hub")
        for name, _ in names:
            logger.info("- %s...", name)
            cmd = ["docker", "tag", name, "{}:{}".format(publish_under, name)]
            subprocess.call(cmd)
            cmd = ["docker", "push", "{}:{}".format(publish_under, name)]
            subprocess.call(cmd)

        logger.info("Done publishing")

    if generate_docker_tarball:
        logger.info("Generating Docker tarballs")
        for name, _ in names:
            logger.info("- %s...", name)
            cmd = ["bash", "-c", "docker save {}" \
             " | xz --threads=0 --best > {}-docker.tar.xz".format(name, name)]
            subprocess.call(cmd)
        logger.info("Done generating Docker tarballs")

    if generate_distro_specific_sct_tarball:

        logger.info("Generating offline archives")
        if not (check_exe("xz") and check_exe("bash")):
            raise RuntimeError(
                "You might want to have bash & xz available when running this tool"
            )
        for name, _ in names:
            logger.info("- %s...", name)
            cmd = ["bash", "-c", "docker run --log-driver=none --entrypoint /bin/sh {} -c 'cd /home/sct; tar c sct_*'" \
             " | xz --threads=0 --best > {}-offline.tar.xz".format(name, name)]
            subprocess.call(cmd)

        logger.info("Done generating offline archives")
Exemple #51
0
def Classification(MalwareCorpus, GoodwareCorpus, MaltestCorpus,
                   GoodtestCorpus, Extn):

    if 'datatxt' in Extn:
        Type = 'Drebin'
    elif 'WL2' in Extn:
        Type = 'WLK'
    elif '_pkg_adicfg_ret_.json.ADG.DirWLWODup' in Extn:
        Type = 'CWLK'
    else:
        Type = 'Other'

    # step 1 - split all samples to training set and test set
    logger.debug("Loading positive and negative samples file basename")

    TrainMalSamples = GetFilesWithExtn(MalwareCorpus, Extn)
    TrainGoodSamples = GetFilesWithExtn(GoodwareCorpus,
                                        Extn)[:len(TrainMalSamples)]
    TestMalSamples = GetFilesWithExtn(MaltestCorpus, Extn)
    TestGoodSamples = GetFilesWithExtn(GoodtestCorpus,
                                       Extn)[:len(TestMalSamples)]
    logger.info("All Samples loaded")
    print '# mal train samples:', len(TrainMalSamples)
    print '# good train samples:', len(TrainGoodSamples)
    print '# mal test samples:', len(TestMalSamples)
    print '# good test samples:', len(TestGoodSamples)

    logger.info("Training and test sets split finished")

    TrainMalLabels = np.ones(len(TrainMalSamples)).tolist()
    TestMalLabels = np.ones(len(TestMalSamples)).tolist()
    TrainGoodLabels = np.empty(len(TrainGoodSamples))
    TrainGoodLabels.fill(-1)
    TrainGoodLabels = TrainGoodLabels.tolist()
    TestGoodLabels = np.ones(len(TestGoodSamples))
    TestGoodLabels.fill(-1)
    TestGoodLabels = TestGoodLabels.tolist()
    logger.info("All labels created")

    TrainSamples = TrainMalSamples + TrainGoodSamples
    TestSamples = TestMalSamples + TestGoodSamples
    TrainLabels = TrainMalLabels + TrainGoodLabels
    TestLabels = TestMalLabels + TestGoodLabels
    NumTestMalSamples = len(TestMalLabels)

    logger.info("All Samples loaded into training and testing sets")
    print "# Train Samples", len(TrainSamples)
    print "# Train Labels", len(TrainLabels)
    print "# Test Samples", len(TestSamples)
    print "# Test Labels", len(TestLabels)

    # step 2 - feature extracting
    TFIDFTransformer = TfidfTransformer()

    NewLineCVetorizer = CountVectorizer(input=u'filename',
                                        lowercase=True,
                                        token_pattern=None,
                                        tokenizer=NewLineTokenizer,
                                        dtype=np.float64)

    print 'performing count vectorizing'
    TrainDocsTermsFVs = NewLineCVetorizer.fit_transform(TrainSamples)
    TestDocsTermsFVs = NewLineCVetorizer.transform(TestSamples)
    print 'performing tf-idf vectorizing'
    TrainFVs = TFIDFTransformer.fit_transform(TrainDocsTermsFVs)
    TestFVs = TFIDFTransformer.transform(TestDocsTermsFVs)

    print 'train term-doc matrix: ', TrainFVs.shape  #rows x cols, rows = docs, cols = features/terms
    print 'test term-doc matrix: ', TestFVs.shape

    # step 3 - classification
    logger.info("Performing Cross Validation")

    EtaList = [0, 0.1, 0.3, 0.5, 0.7, 0.9, 1]
    CWAccuracyList = []

    CList = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    AROWAccuracyList = []

    pool = MyPool(4)

    a = [
        pool.apply_async(GridSearchCV, (
            MCWVarDiag,
            5,
            e,
            len(TrainSamples),
            TrainFVs,
            TrainLabels,
        )) for e in EtaList
    ]
    CWAccuracyList = [res.get() for res in a]
    EtaBest = EtaList[CWAccuracyList.index(max(CWAccuracyList))]
    BestModel_CW = MCWVarDiag(EtaBest, epochs=50)

    a = [
        pool.apply_async(GridSearchCV,
                         args=(
                             ArowDiag,
                             5,
                             c,
                             len(TrainSamples),
                             TrainFVs,
                             TrainLabels,
                         )) for c in CList
    ]
    AROWAccuracyList = [res.get() for res in a]
    CBest = CList[AROWAccuracyList.index(max(AROWAccuracyList))]
    BestModel_AROW = ArowDiag(CBest, n_iters=50)
    pool.close()
    pool.join()

    print 'best model', BestModel_CW, max(CWAccuracyList)
    print 'best model', BestModel_AROW, max(AROWAccuracyList)

    logger.info("Applying Best Model on Testing Set")

    modeldict = {BestModel_CW: 'CW', BestModel_AROW: 'AROW'}
    for Model in [BestModel_CW, BestModel_AROW]:
        T0 = time()
        f = open(modeldict[Model] + '_' + Type + '.txt', 'w')
        f1 = open(modeldict[Model] + '_' + Type + '_Metadata.txt', 'w')

        Model.fit(TrainFVs, TrainLabels)

        PredictedLabels = []
        NewTestLabels = []
        i = 0
        for TestFV, TestLabel in zip(TestFVs, TestLabels):
            #Mal Sample
            if i < NumTestMalSamples:
                TestMalLabel = np.array([TestLabel])
                PredictedLabel = Model.predict(TestFV)
                PredictedLabels.append(float(PredictedLabel))
                NewTestLabels.append(TestLabel)
                if float(PredictedLabel) != TestLabel:
                    try:
                        Model.partial_fit(TestFV, TestLabel)  #update the model
                        logger.info("Model Partially Fitted")
                    except:
                        logger.error("Partially Fitted Failed")
                        pass
                PredictedMalLabel = np.array([float(PredictedLabel)])
                print >> f1, (metrics.classification_report(
                    TestMalLabel,
                    PredictedMalLabel,
                    target_names=['Sample', 'Sample']))
                print >> f1, "Zero-one classification loss:", metrics.zero_one_loss(
                    TestMalLabel, PredictedMalLabel)
                print >> f1, '-' * 100
            #Ben Sample
            if NumTestMalSamples + i < len(TestLabels):
                TestLabel = TestLabels[NumTestMalSamples + i]
                TestFV = TestFVs[NumTestMalSamples + i]
                TestGoodLabel = np.array([TestLabel])
                PredictedLabel2 = Model.predict(TestFV)
                PredictedLabels.append(float(PredictedLabel2))
                NewTestLabels.append(TestLabel)
                if float(PredictedLabel2) != TestLabel:
                    try:
                        Model.partial_fit(TestFVs[NumTestMalSamples + i],
                                          TestLabel)  #update the model
                        logger.info("Model Partially Fitted")
                    except:
                        logger.error("Partially Fitted Failed")
                        pass
                PredictedGoodLabel = np.array([float(PredictedLabel2)])
                print >> f1, (metrics.classification_report(
                    TestGoodLabel,
                    PredictedGoodLabel,
                    target_names=['Sample', 'Sample']))
                print >> f1, "Zero-one classification loss:", metrics.zero_one_loss(
                    TestGoodLabel, PredictedGoodLabel)
                print >> f1, '-' * 100
            i += 1

        if modeldict[Model] == 'CW':
            print >> f, 'Best Eta parameter', EtaBest
        elif modeldict[Model] == 'AROW':
            print >> f, 'Best C parameter', CBest
        print >> f, '-' * 100
        print >> f, '-' * 43 + 'Whole Database' + '-' * 43
        Accuracy = metrics.accuracy_score(PredictedLabels, NewTestLabels)
        print >> f, "Test Set Accuracy = ", Accuracy
        print >> f, 'testing time', time() - T0
        print >> f, (metrics.classification_report(
            NewTestLabels,
            PredictedLabels,
            target_names=['Goodware', 'Malware']))  # raw_input()

        print >> f, 'Classifier Top Features'
        print >> f, '-' * 100
        Vocab = NewLineCVetorizer.get_feature_names()
        try:
            FeautureImportances = Model.model["mu"][1.0].toarray()[0][:-1]
        except:
            FeautureImportances = Model.model["mu"].toarray()[0]
        TopFeatureIndices = FeautureImportances.argsort()[-100:][::-1]
        for FIndex in TopFeatureIndices:
            print >> f, Vocab[FIndex], FeautureImportances[FIndex]
        print >> f, '-' * 100

        print >> f, 'before deleting rows TestFVs.shape', TestFVs.shape
        for i in xrange(len(TestSamples)):
            if -1 == TestLabels[i]:
                TestFVss = TestFVs[:i, :]
                break
        print >> f, 'after deleting rows TestFVs.shape', TestFVss.shape

        FeatureImportancesSparseArray = ssp.lil_matrix(
            (TestFVss.shape[1], TestFVss.shape[1]))
        FeatureImportancesSparseArray.setdiag(FeautureImportances)
        AllFVsTimesW = TestFVss * FeatureImportancesSparseArray

        print >> f, '-' * 100
        AvgFV = AllFVsTimesW.mean(axis=0)
        AvgFV = AvgFV.view(dtype=np.float64).reshape(AvgFV.shape[1], -1)
        AvgFV = np.array(AvgFV).reshape(-1, )
        TopRes = AvgFV.argsort()[-100:][::-1]
        print >> f, 'Top Feats of Test Positive Vector * Feature Importance Vector'
        for Sindex in TopRes:
            print >> f, Vocab[Sindex], AvgFV[Sindex]
        print >> f, '-' * 100
Exemple #52
0
    def __init__(self,
                 directory,
                 image_data_generator,
                 target_size=(256, 256),
                 color_mode='rgb',
                 classes=None,
                 class_mode='categorical',
                 batch_size=32,
                 shuffle=True,
                 seed=None,
                 data_format='channels_last',
                 save_to_dir=None,
                 save_prefix='',
                 save_format='png',
                 follow_links=False,
                 subset=None,
                 interpolation='nearest',
                 dtype='float32'):
        super(DirectoryIterator, self).set_processing_attrs(image_data_generator,
                                                            target_size,
                                                            color_mode,
                                                            data_format,
                                                            save_to_dir,
                                                            save_prefix,
                                                            save_format,
                                                            subset,
                                                            interpolation)
        self.directory = directory
        self.classes = classes
        if class_mode not in self.allowed_class_modes:
            raise ValueError('Invalid class_mode: {}; expected one of: {}'
                             .format(class_mode, self.allowed_class_modes))
        self.class_mode = class_mode
        self.dtype = dtype
        # First, count the number of samples and classes.
        self.samples = 0

        if not classes:
            classes = []
            for subdir in sorted(os.listdir(directory)):
                if os.path.isdir(os.path.join(directory, subdir)):
                    classes.append(subdir)
        self.num_classes = len(classes)
        self.class_indices = dict(zip(classes, range(len(classes))))

        pool = multiprocessing.pool.ThreadPool()

        # Second, build an index of the images
        # in the different class subfolders.
        results = []
        self.filenames = []
        i = 0
        for dirpath in (os.path.join(directory, subdir) for subdir in classes):
            results.append(
                pool.apply_async(_list_valid_filenames_in_directory,
                                 (dirpath, self.white_list_formats, self.split,
                                  self.class_indices, follow_links)))
        classes_list = []
        for res in results:
            classes, filenames = res.get()
            classes_list.append(classes)
            self.filenames += filenames
        self.samples = len(self.filenames)
        self.classes = np.zeros((self.samples,), dtype='int32')
        for classes in classes_list:
            self.classes[i:i + len(classes)] = classes
            i += len(classes)

        print('Found %d images belonging to %d classes.' %
              (self.samples, self.num_classes))
        pool.close()
        pool.join()
        super(DirectoryIterator, self).__init__(self.samples,
                                                batch_size,
                                                shuffle,
                                                seed)
Exemple #53
0
def run_function_different_arguments_parallel(function,
                                              arguments,
                                              parallel=True,
                                              all_success=True,
                                              signal=None,
                                              use_thread=False,
                                              *args,
                                              **kwargs):
    """
    Call functions in parallel
    :param function: f(argument, **kwargs)
    :param arguments: {i: argument}
    :param all_success: (boolean) the function will raise an exception if one of the runs
        fail and all_success is True
    :param signal: (function) calls this function after generating the jobs. It's used to test
        KeyboardInterrupt, and the signal is a mock of KeyboardInterrupt.
    :param parallel: (boolean) The code is run in parallel only if it's True.
    :param use_thread: (boolean) uses threads instead of processes if True
    :param args: additional arguments of function
    :param kwargs: additional arguments of function
    :return: {int: output of f(arguments[i])}
    """
    # Maybe later we enable this feature.
    #thread = False
    if not parallel:
        results = {}
        for key, argument in arguments.items():
            _args = (argument, ) + args
            results[key] = function(*_args, **kwargs)
        return results
    else:
        jobs = {}

        n_jobs = min(len(arguments), mp.cpu_count())

        if use_thread:
            threads = len(arguments)
            pool = ThreadPool(threads)
        else:
            pool = mp.Pool(processes=n_jobs)

        try:
            for key, argument in arguments.items():
                job = pool.apply_async(function,
                                       args=(argument, ) + args,
                                       kwds=kwargs)
                jobs[key] = job
            pool.close()
            pool.join()
            if signal is not None:
                signal(1)
        except KeyboardInterrupt:
            print("Ctrl+c received, terminating and joining pool.")
            pool.terminate()
            pool.join()
            return -1

        results = {}
        n_retry = 5
        for key in arguments.keys():
            for count in range(n_retry):  # retry 5 times before raise error.
                try:
                    results[key] = jobs[key].get()
                    break
                except Exception as e:
                    # if all_success:
                    #     raise e
                    if count == n_retry - 1:
                        raise e
                    else:
                        print("job failed")
                        print(argument)
                        print(e)
                        print(args)
                        print(kwargs)
                        print('Retrying ...')
        return results
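A brief usage sketch for run_function_different_arguments_parallel; the worker and the keyed arguments are illustrative:

def square(x, offset=0):
    # Illustrative worker: one positional argument plus an optional keyword.
    return x * x + offset


if __name__ == '__main__':
    arguments = {0: 2, 1: 3, 2: 4}
    out = run_function_different_arguments_parallel(square, arguments, offset=1)
    print(out)  # {0: 5, 1: 10, 2: 17}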
Exemple #54
0
 def func_wrapper(*args, **kwargs):
     """Closure for function."""
     pool = multiprocessing.pool.ThreadPool(processes=1)
     async_result = pool.apply_async(item, args, kwargs)
     # raises a TimeoutError if execution exceeds max_timeout
     return async_result.get(max_timeout)
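The closure above is the inner piece of a timeout decorator built on a one-thread pool; a self-contained sketch of the full pattern it implies (the decorator and function names are assumptions, not taken from the excerpt):

import functools
import multiprocessing.pool
import time


def timeout(max_timeout):
    # Return a decorator that aborts the wrapped call after max_timeout seconds.
    def decorator(item):
        @functools.wraps(item)
        def func_wrapper(*args, **kwargs):
            pool = multiprocessing.pool.ThreadPool(processes=1)
            async_result = pool.apply_async(item, args, kwargs)
            # get() raises multiprocessing.TimeoutError if the call takes too
            # long; the pool is left to be garbage-collected, as in the excerpt.
            return async_result.get(max_timeout)
        return func_wrapper
    return decorator


@timeout(2.0)
def slow_add(a, b):
    time.sleep(1)
    return a + b


if __name__ == '__main__':
    print(slow_add(1, 2))  # returns 3 well within the 2-second budget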
Exemple #55
0
def train(current_time, loaded_version):
    """ Train the models using the data generated by the self-play """

    last_id = 0
    total_ite = 0
    lr = LR
    version = 1
    pool = False
    criterion = AlphaLoss()
    dataset = SelfPlayDataset()

    ## Database connection
    client = MongoClient()
    collection = client.superGo[current_time]

    ## First player either from disk or fresh
    if loaded_version:
        player, checkpoint = load_player(current_time, loaded_version)
        optimizer = create_optimizer(player, lr, param=checkpoint['optimizer'])
        total_ite = checkpoint['total_ite']
        lr = checkpoint['lr']
        version = checkpoint['version']
        last_id = collection.find().count() - (MOVES // MOVE_LIMIT) * 2
        #last_id = collection.find().count() - 1
    else:
        player = Player()
        optimizer = create_optimizer(player, lr)
        state = create_state(version, lr, total_ite, optimizer)
        player.save_models(state, current_time)
    best_player = deepcopy(player)

    ## Callback after the evaluation is done, must be a closure
    def new_agent(result):
        if result:
            nonlocal version, pending_player, current_time, \
                    lr, total_ite, best_player
            version += 1
            state = create_state(version, lr, total_ite, optimizer)
            best_player = pending_player
            pending_player.save_models(state, current_time)
            print("[EVALUATION] New best player saved !")
        else:
            nonlocal last_id
            ## Force a new fetch in case the player didn't improve
            last_id = fetch_new_games(collection, dataset, last_id)

    ## Wait until the circular buffer is full
    while len(dataset) < MOVES:
        last_id = fetch_new_games(collection,
                                  dataset,
                                  last_id,
                                  loaded_version=loaded_version)
        time.sleep(30)

    print("[TRAIN] Circular buffer full !")
    print("[TRAIN] Starting to train !")
    dataloader = DataLoader(dataset, collate_fn=collate_fn, \
                            batch_size=BATCH_SIZE, shuffle=True)

    while True:
        batch_loss = []
        for batch_idx, (state, move, winner) in enumerate(dataloader):
            running_loss = []
            lr, optimizer = update_lr(lr, optimizer, total_ite)

            ## Evaluate a copy of the current network asynchronously
            if total_ite % TRAIN_STEPS == 0:
                if (pool):
                    pending_player = deepcopy(player)
                    last_id = fetch_new_games(collection, dataset, last_id)

                    ## Wait in case an evaluation is still going on
                    # if pool:
                    #     print("[EVALUATION] Waiting for eval to end before re-eval")
                    #     pool.close()
                    #     pool.join()
                    pool = MyPool(1)
                    try:
                        pool.apply_async(evaluate, args=(pending_player, best_player), \
                                callback=new_agent)
                        pool.close()
                        pool.join()
                    except Exception as e:
                        client.close()
                        pool.terminate()
                pool = True

            example = {'state': state, 'winner': winner, 'move': move}
            loss = train_epoch(player, optimizer, example, criterion)
            running_loss.append(loss)

            ## Print running loss
            if total_ite % LOSS_TICK == 0:
                print("[TRAIN] current iteration: %d, averaged loss: %.3f"\
                        % (total_ite, np.mean(running_loss)))
                batch_loss.append(np.mean(running_loss))
                running_loss = []

            ## Fetch new games
            if total_ite % REFRESH_TICK == 0:
                last_id = fetch_new_games(collection, dataset, last_id)

            total_ite += 1

        if len(batch_loss) > 0:
            print("[TRAIN] Average backward pass loss : %.3f, current lr: %f" %
                  (np.mean(batch_loss), lr))
Exemple #56
0
 def func_wrapper(*args, **kwargs):
     pool = multiprocessing.pool.ThreadPool(processes=1)
     async_result = pool.apply_async(item, args, kwargs)
     return async_result.get(max_timeout)
Exemple #57
0
import os
from time import sleep
from multiprocessing import Pool


def func1(name):
    print(f"Current process ID: {os.getpid()}, {name}")
    sleep(2)
    return name


def func2(args):
    print(args)


if __name__ == "__main__":
    # Create a pool with 5 worker processes
    pool = Pool(5)

    pool.apply_async(func=func1, args=("sxt1", ), callback=func2)
    pool.apply_async(func=func1, args=("sxt2", ), callback=func2)
    pool.apply_async(func=func1, args=("sxt3", ), callback=func2)
    pool.apply_async(func=func1, args=("sxt4", ))
    pool.apply_async(func=func1, args=("sxt5", ))
    pool.apply_async(func=func1, args=("sxt6", ))
    pool.apply_async(func=func1, args=("sxt7", ))
    pool.apply_async(func=func1, args=("sxt8", ))

    # Close the pool: no new tasks will be accepted
    pool.close()

    # Wait for the worker processes to finish
    pool.join()
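Since Python 3.2, apply_async also accepts an error_callback that fires when the task raises instead of returning; a small variant of the example above, reusing the same Pool import and func2 callback (might_fail and on_error are illustrative):

def might_fail(name):
    if name.endswith("3"):
        raise ValueError(f"bad task: {name}")
    return name


def on_error(exc):
    # Runs in the parent process when the task raised an exception.
    print(f"task failed: {exc}")


if __name__ == "__main__":
    pool = Pool(5)
    for i in range(1, 6):
        pool.apply_async(func=might_fail, args=(f"sxt{i}",),
                         callback=func2, error_callback=on_error)
    pool.close()
    pool.join()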

x = "The are %d types of people." % 10
Exemple #58
0
    def parallel_get(self, urls: List[str]) -> List[Response]:
        """GET multiple URLs in parallel."""

        # FIXME doesn't respect timing() and other object properties

        urls = decode_object_from_bytes_if_needed(urls)

        # Original implementation didn't raise on undefined / empty list of URLs
        if urls is None:
            return []
        if len(urls) == 0:
            return []

        # Remove duplicates from list while maintaining order because:
        # 1) We don't want to fetch the same URL twice
        # 2) URLs are being used as unique dictionary IDs later on
        urls_before_removing_duplicates = urls.copy()
        urls = list(OrderedDict.fromkeys(urls))
        if len(urls) != len(urls_before_removing_duplicates):
            log.warning("Some of the URLs are duplicate; URLs: %s" %
                        str(urls_before_removing_duplicates))

        # Raise on one or more invalid URLs because we consider it a caller's problem; if URL at least looks valid,
        # get() in a fork should be able to come up with a reasonable Response object for it
        for url in urls:
            if not is_http_url(url):
                raise McParallelGetException(
                    "URL %s is not a valid URL; URLs: %s" % (
                        url,
                        str(urls),
                    ))

        num_parallel = self._user_agent_config.parallel_get_num_parallel()
        timeout = self._user_agent_config.parallel_get_timeout()
        per_domain_timeout = self._user_agent_config.parallel_get_per_domain_timeout(
        )

        url_stack = UserAgent.__get_scheduled_urls(
            urls_=urls, per_domain_timeout_=per_domain_timeout)

        start_time = time.time()

        url_blocks = {}
        while len(url_stack) > 0:
            block_i = len(url_stack) % num_parallel

            if block_i not in url_blocks:
                url_blocks[block_i] = []

            url_blocks[block_i].append(url_stack.pop())

        # Using ThreadPool instead of Pool because this sometimes gets called from a Celery worker, and if it does,
        # it might fail with:
        #
        # Traceback (most recent call last):
        #   File "/opt/mediacloud/src/common/python/mediawords/util/web/user_agent/__init__.py", line 505, in parallel_get
        #     pool = multiprocessing.Pool(processes=num_parallel)
        #   File "/usr/lib/python3.7/multiprocessing/context.py", line 119, in Pool
        #     context=self.get_context())
        #   File "/usr/lib/python3.7/multiprocessing/pool.py", line 176, in __init__
        #     self._repopulate_pool()
        #   File "/usr/lib/python3.7/multiprocessing/pool.py", line 241, in _repopulate_pool
        #     w.start()
        #   File "/usr/lib/python3.7/multiprocessing/process.py", line 110, in start
        #     'daemonic processes are not allowed to have children'
        # AssertionError: daemonic processes are not allowed to have children
        #
        pool = multiprocessing.pool.ThreadPool(processes=num_parallel)

        all_results = []
        for i, url_block in url_blocks.items():
            result = pool.apply_async(_parallel_get_web_store,
                                      args=(
                                          url_block,
                                          start_time,
                                          timeout,
                                      ))
            all_results.append(result)

        all_responses = []
        for result in all_results:
            responses = result.get()
            all_responses = all_responses + responses

        # No timeouts here because we trust the workers to timeout by themselves (by UserAgent)
        pool.close()
        pool.join()
        pool.terminate()

        # Sort URLs in parameter order
        # (if URLs weren't split into blocks, we could probably use map_async)
        response_url_map = {}
        for response in all_responses:
            url = response.scheduled_url.url
            response_url_map[url] = response.response

        sorted_responses = []
        for url in urls:
            if url not in response_url_map:
                raise McParallelGetException(
                    "URL %s is not in the response URL map %s." % (
                        url,
                        response_url_map,
                    ))

            sorted_responses.append(response_url_map[url])

        if len(urls) != len(sorted_responses):
            raise McParallelGetException(
                "Response count doesn't match URL count; responses: %s; URLs: %s"
                % (
                    sorted_responses,
                    urls,
                ))

        return sorted_responses
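
The traceback quoted in the comment above is the whole reason ThreadPool is used here: workers spawned by multiprocessing.Pool are daemonic, and daemonic processes may not start child processes, but they may start threads. A minimal, self-contained sketch of that failure mode and the workaround (fetch and worker are illustrative names, not part of the original code):

import multiprocessing
import multiprocessing.pool


def fetch(url):
    # Placeholder for an I/O-bound task such as an HTTP GET.
    return "fetched %s" % url


def worker(urls):
    # Inside a daemonic Pool worker, multiprocessing.Pool(...) would raise
    # "daemonic processes are not allowed to have children"; a ThreadPool
    # only starts threads, so it is allowed.
    pool = multiprocessing.pool.ThreadPool(processes=4)
    try:
        results = [pool.apply_async(fetch, args=(u,)) for u in urls]
        return [r.get() for r in results]
    finally:
        pool.close()
        pool.join()


if __name__ == "__main__":
    # The outer pool's workers are daemonic, which is what makes a nested Pool() fail.
    with multiprocessing.Pool(processes=2) as outer:
        print(outer.apply(worker, (["http://a.example", "http://b.example"],)))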
Exemple #59
0
    def __init__(self, directory,
                 image_data_generator=None,
                 target_size=(256, 256),
                 color_mode='rgb',
                 classes=None,
                 class_mode='categorical',
                 batch_size=32,
                 shuffle=True,
                 seed=None,
                 data_format=None,
                 save_to_dir=None,
                 save_prefix='',
                 save_format='png',
                 followlinks=False,
                 image_ext_list=config.IMAGE_EXTENSIONS):
        if data_format is None:
            data_format = K.image_data_format()
        self.directory = directory
        self.image_data_generator = image_data_generator
        self.target_size = tuple(target_size)
        if color_mode not in {'rgb', 'grayscale'}:
            raise ValueError('Invalid color mode:', color_mode,
                             '; expected "rgb" or "grayscale".')
        self.color_mode = color_mode
        self.data_format = data_format
        if class_mode not in {'categorical', 'binary', 'sparse',
                              'input', None}:
            raise ValueError('Invalid class_mode:', class_mode,
                             '; expected one of "categorical", '
                             '"binary", "sparse", "input"'
                             ' or None.')
        self.class_mode = class_mode
        self.save_to_dir = save_to_dir
        self.save_prefix = save_prefix
        self.save_format = save_format

        # first, count the number of samples and classes
        self.samples = 0

        if not classes:
            classes = []
            for subdir in sorted(os.listdir(directory)):
                if os.path.isdir(os.path.join(directory, subdir)):
                    classes.append(subdir)
        self.num_class = len(classes)
        self.class_indices = dict(zip(classes, range(len(classes))))

        pool = multiprocessing.pool.ThreadPool()
        function_partial = partial(_count_valid_files_in_directory,
                                   white_list_formats=image_ext_list,
                                   follow_links=followlinks)
        self.samples = sum(pool.map(function_partial,
                                    (os.path.join(directory, subdir)
                                     for subdir in classes)))

        print('Found {0} images belonging to {1} classes.'.format(
            self.samples, self.num_class))

        # second, build an index of the images in different class subfolders
        results = []
        self.filepaths = []
        self.classes = np.zeros((self.samples,), dtype='int32')
        i = 0
        for dirpath in (os.path.join(directory, subdir) for subdir in classes):
            results.append(pool.apply_async(list_valid_filepaths_in_directory,
                                            (dirpath, image_ext_list,
                                             self.class_indices,
                                             followlinks)))
        for res in results:
            classes, filepaths = res.get()
            self.classes[i:i + len(classes)] = classes
            self.filepaths += filepaths
            i += len(classes)
        self.filepaths = np.array(self.filepaths)
        pool.close()
        pool.join()
        super(DirIterator, self).__init__(self.samples, batch_size,
                                          shuffle, seed)
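
The constructor mixes the two pool APIs on the same ThreadPool: a blocking map() to count samples, then apply_async() plus get() to build the per-class index. A toy sketch of that mixed pattern (FAKE_TREE, count_valid_files and list_valid_filepaths are illustrative stand-ins, not the original helpers):

import multiprocessing.pool
from functools import partial

# Tiny in-memory stand-in for a directory tree of class subfolders.
FAKE_TREE = {
    "cats": ["c1.png", "c2.png", "notes.txt"],
    "dogs": ["d1.png"],
}


def count_valid_files(subdir, white_list_formats):
    # Blocking helper used with pool.map(): count files with an allowed extension.
    return sum(1 for f in FAKE_TREE[subdir] if f.endswith(tuple(white_list_formats)))


def list_valid_filepaths(subdir, white_list_formats, class_indices):
    # Helper used with pool.apply_async(): return (class labels, file paths).
    files = [f for f in FAKE_TREE[subdir] if f.endswith(tuple(white_list_formats))]
    return [class_indices[subdir]] * len(files), ["%s/%s" % (subdir, f) for f in files]


if __name__ == "__main__":
    classes = sorted(FAKE_TREE)
    class_indices = dict(zip(classes, range(len(classes))))
    pool = multiprocessing.pool.ThreadPool()
    # map() blocks until every count is in, like the sample-counting step above.
    samples = sum(pool.map(partial(count_valid_files, white_list_formats=[".png"]), classes))
    # apply_async() returns immediately; results are collected afterwards with get().
    jobs = [pool.apply_async(list_valid_filepaths, (c, [".png"], class_indices)) for c in classes]
    index = [job.get() for job in jobs]
    pool.close()
    pool.join()
    print(samples, index)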
Exemple #60
0
def minimize(p0, pop_size=2, generations=2, processes=4):
    crossover_probability = 0.02
    mutation_probability = 0.1
    new_probability = 0.75
    pop = []
    pop.append(Individual(p0, clone=True))
    for i in range(pop_size):
        pop.append(Individual(p0))

    # Evaluate the entire population
    pool = MyPool(processes=processes)
    jobs = []
    for individual in pop:
        inputs = (individual.p, individual.ssn)
        jobs.append(pool.apply_async(evaluate, args=inputs))
    pool.close()
    pool.join()

    for job, individual in zip(jobs, pop):
        fitness, scalar = job.get()
        individual.setFitness(fitness)
        individual.setScalar(scalar)

    for gen in range(generations):
        offspring = []
        # create a new population member and mix it with the best
        if np.random.rand() < new_probability:
            new_member = Individual(pop[0].p)
            offspring.append(new_member)
            child1, child2, changed = new_member.crossover(pop[0])
            if changed:
                offspring.append(child1)
                offspring.append(child2)

        # iterate over each individual in the population
        for i, individual in enumerate(pop):
            if np.random.rand() < crossover_probability:
                # Crossover with a random, non-identical partner
                partner = np.random.randint(len(pop))
                while partner == i:
                    partner = np.random.randint(len(pop))
                child1, child2, changed = individual.crossover(pop[partner])
                if changed:
                    offspring.append(child1)
                    offspring.append(child2)
            if np.random.rand() < mutation_probability:
                # Create a mutant
                mutant = individual.clone_self()
                if mutant.mutate():
                    offspring.append(mutant)

        # Evaluate the offspring
        if len(offspring) > 0:
            pool = MyPool(processes=processes)
            jobs = []
            for individual in offspring:
                inputs = (individual.p, individual.ssn)
                jobs.append(pool.apply_async(evaluate, inputs))
            pool.close()
            pool.join()
            for job, individual in zip(jobs, offspring):
                fitness, scalar = job.get()
                individual.setFitness(fitness)
                individual.setScalar(scalar)
        new_pop = pop + offspring

        def find_value(ind):
            return ind.fitness
        new_pop.sort(key=find_value)
        '''
        # Ensure Genetic Diversity! - Because they are already sorted, we only
        #                             need to compare neighbors
        previous_p = p0[:-1] * 0.00 # Exclude the scalar
        for individual in new_pop:
            this_p = individual.p[:-1]  # Exclude the scalar
            if (this_p == previous_p).all():
                new_pop.remove(individual)
            else:
                previous_p = this_p
        '''
        pop = new_pop[:pop_size]
    return pop
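
Each generation repeats the same submit / close / join / get sequence. Under the interfaces shown above (evaluate returning a (fitness, scalar) tuple and MyPool supporting apply_async), that loop could be factored into a small helper; evaluate_population is an illustrative name, not from the original code:

def evaluate_population(individuals, processes):
    # Evaluate individuals in parallel and store fitness/scalar on each one.
    if not individuals:
        return
    pool = MyPool(processes=processes)
    jobs = [pool.apply_async(evaluate, args=(ind.p, ind.ssn)) for ind in individuals]
    pool.close()
    pool.join()
    for job, ind in zip(jobs, individuals):
        fitness, scalar = job.get()
        ind.setFitness(fitness)
        ind.setScalar(scalar)

With this helper, the two evaluation blocks in minimize() reduce to evaluate_population(pop, processes=processes) and evaluate_population(offspring, processes=processes).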