def initialize():
    """Allocate shared-memory storage for four (state, value) pairs.

    Each pair is created inside the comprehension so that every slot gets
    its own distinct shared-memory objects. Multiplying a single literal
    by 4 would replicate references to the same pair instead of deep-copying
    it — hence one fresh pair per iteration.
    """
    return [
        [GridShmProxy.StateProxy.initialize(), mp.Value('d', 0, lock=True)]
        for _ in range(4)
    ]
def __init__(self, domain_factory, shm_proxy, lambdas=None, nb_domains=os.cpu_count(), ipc_notify=False):
    """Set up shared-memory communication structures for nb_domains parallel domains.

    Builds, per domain: activation/done flags, a condition variable, and a
    flattened register of shared-memory arrays that vectorize the parameter
    types declared by shm_proxy.register().

    :param domain_factory: factory passed through to the parent class
    :param shm_proxy: proxy object exposing register() -> iterable of
                      (type, count) pairs and initialize(type) -> shared
                      value or list/tuple of shared values
    :param lambdas: optional lambdas forwarded to the parent class
    :param nb_domains: number of parallel domains (default: CPU count)
    :param ipc_notify: forwarded to the parent class
    """
    super().__init__(domain_factory, lambdas, nb_domains, ipc_notify)
    # One "activation" and one "done" byte flag per domain, each with its own lock.
    self._activations = [mp.Value('b', False, lock=True) for i in range(nb_domains)]
    self._dones = [mp.Value('b', False, lock=True) for i in range(nb_domains)]
    self._conditions = [mp.Condition() for i in range(nb_domains)]
    self._shm_proxy = shm_proxy
    self._shm_registers = {}  # Maps from registered method parameter types to vectorized array ranges
    self._shm_types = {}  # Maps from register index to type
    self._shm_sizes = {}  # Maps from register method parameter types to number of arrays encoding each type
    self._shm_arrays = []  # Methods' vectorized parameters
    self._rsize = 0  # Total size of the register (updated below)
    self._shm_lambdas = [None] * nb_domains  # Vectorized lambdas' ids
    self._shm_names = [None] * nb_domains  # Vectorized methods' names
    self._shm_params = [None] * nb_domains  # Indices of methods' vectorized parameters
    for i in range(nb_domains):
        # j is the running offset of the current type's arrays inside this
        # domain's slice of the flattened register.
        j = 0
        for r in shm_proxy.register():
            # r is a (type, count) pair — r[0] the parameter type, r[1] how
            # many instances of it must be vectorized per domain.
            for k in range(r[1]):
                m = shm_proxy.initialize(r[0])
                if type(m) == list or type(m) == tuple:
                    # Composite type: encoded by len(m) parallel arrays.
                    if i == 0 and k == 0:  # do it once for all the domains and redundant initializers
                        self._shm_sizes[r[0].__name__] = len(m)
                        self._shm_registers[r[0].__name__] = (j, j + (r[1] * len(m)))
                        # One type entry per instance start offset (stride len(m)).
                        self._shm_types.update({kk: r[0] for kk in range(j, j + (r[1] * len(m)), len(m))})
                        self._rsize += (r[1] * len(m))
                    self._shm_arrays.extend(m)
                    j += len(m)
                else:
                    # Scalar type: encoded by a single shared array.
                    if i == 0 and k == 0:  # do it once for all the domains and redundant initializers
                        self._shm_sizes[r[0].__name__] = 1
                        self._shm_registers[r[0].__name__] = (j, j + r[1])
                        self._shm_types.update({kk: r[0] for kk in range(j, j + r[1])})
                        self._rsize += r[1]
                    self._shm_arrays.append(m)
                    j += 1
        # Per-domain lambda id (-1 = none), method name buffer (100 bytes —
        # presumably an upper bound on method-name length; TODO confirm), and
        # one parameter-index slot per registered instance.
        self._shm_lambdas[i] = mp.Value('i', -1, lock=True)
        self._shm_names[i] = mp.Array('c', bytearray(100))
        self._shm_params[i] = mp.Array('i', [-1] * sum(r[1] for r in shm_proxy.register()))
    logger.info(rf'Using {nb_domains} parallel shared memory domains')
def start(self):
    """Start the producer workers (processes or threads) and, when needed,
    a synchronizing batcher thread that restores FIFO order for
    multiprocessing queues.

    Side effects: creates self.queue / self.cache_queue / self.exit /
    self.last_completed_job, launches daemon workers, sets self.started.
    """
    max_queue_size = 1 if self.ordered else self.max_queue_size // 2
    self.queue = multip.Queue(
        maxsize=max_queue_size) if self.multiprocess else Queue.Queue(
            maxsize=self.max_queue_size)
    # Flag used for keeping values in completed queue in order
    self.last_completed_job = multip.Value('i', -1)
    self.exit = multip.Event()
    if self.multiprocess and self.ordered:
        self.cache_queue = Queue.Queue(maxsize=self.max_queue_size)

        def batcher(queue, cache_queue):
            while not self.exit.is_set():
                job_index, item = queue.get()
                cache_queue.put((job_index, item))
                time.sleep(0.0001)  # to be sure..

        # As Queues in Python are __!__NOT__!__ First in first out in a
        # multiprocessing setting, we use a separate thread to synchronously
        # put them in order.
        p = Thread(target=batcher,
                   args=(self.queue, self.cache_queue),
                   name='Synchronous batcher worker')
        p.daemon = True
        p.start()
    else:
        self.cache_queue = self.queue
    # Start worker processes or threads.
    # BUG FIX: was `xrange`, which does not exist on Python 3 (NameError);
    # the rest of the file uses Python 3 idioms, so use `range`.
    for i in range(self.n_producers):
        name = "ContinuousParallelBatchIterator worker {0}".format(i)
        if self.multiprocess:
            p = multip.Process(target=_produce_helper,
                               args=(i, self.generator, self.job_queue,
                                     self.queue, self.last_completed_job,
                                     self.ordered, self.exit),
                               name=name)
        else:
            p = Thread(target=_produce_helper,
                       args=(i, self.generator, self.job_queue, self.queue,
                             self.last_completed_job, self.ordered, self.exit),
                       name=name)
        # Make the process daemon, so the main process can die without these finishing
        p.daemon = True
        p.start()
    self.started = True
def gen_programs(program_len, num_programs, args):
    """
    Generates the specified amount of programs of the given length.
    These are the exact steps performed:
    1. Generate <num_programs> programs using gen_program_worker in a process pool
    2. Generate examples for each program by executing gen_examples_worker
       in a process pool. Discard programs for which the required amount of
       examples could not be generated.
    3. Return a dictionary of the form {program: examples}
    """
    progress_counter = multiprocessing.Value('i', 0)
    # FIX: use the pools as context managers so worker processes are always
    # terminated/reaped instead of being leaked on every call.
    with multiprocessing.Pool(processes=args.num_workers,
                              initializer=init_gen_prog_worker,
                              initargs=(progress_counter, num_programs,
                                        program_len)) as gen_prog_pool:
        input_type_combinations = get_input_type_combinations(params.num_inputs)
        programs = gen_prog_pool.map(gen_program_worker, input_type_combinations)
    print('')

    # Flatten the per-worker lists and deduplicate.
    programs = [item for sublist in programs for item in sublist]
    programs = list(set(programs))

    # Generate examples and filter out null programs
    progress_counter.value = 0
    valid_counter = multiprocessing.Value('i', len(programs))
    with multiprocessing.Pool(processes=args.num_workers,
                              initializer=init_gen_examples_worker,
                              initargs=(progress_counter, valid_counter,
                                        len(programs), args.num_examples,
                                        args.num_example_tries)) as gen_examples_pool:
        res = gen_examples_pool.map(gen_examples_worker, programs)
    print('')

    # Keep only programs for which examples could actually be generated.
    examples = dict(zip(programs, res))
    examples = {k: v for k, v in examples.items() if v}
    return examples
def __init__(
    self, domain_factory, lambdas=None, nb_domains=os.cpu_count(), ipc_notify=False
):
    """Prepare per-domain bookkeeping for running nb_domains parallel domains.

    Stores the factory and lambdas, allocates one shared activity flag,
    one shared initialization flag and one condition variable per domain,
    and reserves placeholder slots for connections and processes that are
    filled in later when the workers are launched.
    """
    self._domain_factory = domain_factory
    self._lambdas = lambdas
    self._ipc_notify = ipc_notify
    # One shared byte flag per domain, packed into a single locked array.
    self._active_domains = mp.Array("b", [False] * nb_domains, lock=True)
    domains = range(nb_domains)
    self._initializations = [mp.Value("b", False, lock=True) for _ in domains]
    self._conditions = [mp.Condition() for _ in domains]
    # Placeholders, populated once each worker process is started.
    self._temp_connections = [None for _ in domains]
    self._ipc_connections = [None for _ in domains]
    self._processes = [None for _ in domains]
def initialize():
    """Allocate the shared double for this proxy (False coerces to 0.0)."""
    shared = mp.Value('d', False)
    return shared
def initialize():
    """Allocate this proxy's shared pair: a double plus a byte flag."""
    number = mp.Value('d', 0)
    flag = mp.Value('b', False)
    return [number, flag]
def initialize():
    """Allocate the shared unsigned-int counter, guarded by its own lock."""
    counter = mp.Value('I', 0, lock=True)
    return counter
def initialize():
    """Allocate the shared int flag for this proxy (starts unset: 0)."""
    flag = mp.Value("i", False)
    return flag
def initialize():
    """Allocate this proxy's shared pair: a double value and a byte flag."""
    value_slot = mp.Value("d", 0)
    flag_slot = mp.Value("b", False)
    return [value_slot, flag_slot]
def main():
    """
    Generates programs. These are the basic steps performed:
    D = {}
    for 1 <= i <= max_train_len:
        1. P = Generate programs of length i
        2. E = Generate examples for the generated programs
        3. Discard programs in P that are equivalent to any program in D
        4. D += (P, E)
    for j in test_lengths:
        Sample num_test programs
        Discard all programs of equal length in D which are equivalent.

    Note:
    1. Step 3 of the first greatly increases the richness of the dataset.
       We ensure this way that our programs aren't likely to have shorter
       equivalents.
    2. It is recommended to use --cache to load a dataset cache. The
       algorithm then continues generating for lengths larger than the
       maximum length of the cache. This allows incremental dataset
       generation and also helps with the generation of shorter programs
       where generation is slow due to randomness. Furthermore, we can (and
       should!) have virtually all programs of length <=3, to ensure our
       dataset is meaningful.
    3. During test sampling we only compare to programs of equivalent
       lengths for efficiency. This is since our data generation algorithm
       already ensures that for all longer and shorter programs there is no
       equivalence.
    4. Since the pruning is done after program generation, rather than
       during, the number of programs generated in each iteration is NOT
       args.num_train. This is done purely due to implementation details:
       it is challenging to discard whilst generating since it would
       require all processes to write and read from the same dictionary in
       parallel. However, this is a good feature for the future, to avoid
       having to try multiple values for num_train via trial-and-error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_train', type=int, required=True)
    parser.add_argument('--num_test', type=int, required=True)
    parser.add_argument('--train_output_path', type=str, required=True)
    parser.add_argument('--test_output_path', type=str, required=True)
    parser.add_argument('--max_train_len', type=int, required=True)
    parser.add_argument('--min_train_len', default=5, type=int, required=False)  # me
    parser.add_argument('--test_lengths', type=str, required=True,
                        help="List of test lengths to generate")
    parser.add_argument('--num_workers', type=int, default=8)
    parser.add_argument('--num_examples', type=int, default=params.num_examples)
    parser.add_argument(
        '--num_example_tries', type=int, default=200,
        help='total amount of tries to generate examples to try to generate')
    parser.add_argument(
        '--cache', type=str, default=None,
        help="Dataset cache from which to continue generating programs")
    args = parser.parse_args()

    test_lens = set([int(x) for x in args.test_lengths.split()])

    # Pick the starting length: explicit --min_train_len wins, then the
    # cache's maximum length, otherwise start from scratch.
    if args.min_train_len != -1:
        examples = {}
        min_len = args.min_train_len - 1  # as following loops start from +1
    else:
        if args.cache:
            examples = load_cache(args.cache)
            min_len = max([len(k) for k in examples])
        else:
            examples = {}
            min_len = 0

    for program_len in range(min_len + 1, args.max_train_len + 1):
        num_programs = args.num_train + args.num_test
        if program_len in KNOWN_TRAIN_SIZES:
            num_programs = min(num_programs, KNOWN_TRAIN_SIZES[program_len])

        print("Generating programs of length %d (current dataset size: %d)" %
              (program_len, len(examples)))
        new_examples = gen_programs(program_len, num_programs, args)

        # Discard new programs equivalent to any already-kept program,
        # splitting the work round-robin across the workers.
        existing_programs = list(examples.keys())
        counter = multiprocessing.Value('i', 0)
        new_programs = list(new_examples.keys())
        # FIX: run the pool as a context manager so its worker processes
        # are reaped instead of leaked on every iteration.
        with multiprocessing.Pool(
                processes=args.num_workers,
                initializer=init_discard_identical_worker,
                initargs=(existing_programs, counter,
                          len(new_programs))) as discard_pool:
            new_program_parts = [new_programs[i::args.num_workers]
                                 for i in range(args.num_workers)]
            new_example_parts = [{p: new_examples[p] for p in programs}
                                 for programs in new_program_parts]
            res = discard_pool.map(discard_identical_worker, new_example_parts)
        print('')
        for d in res:
            examples.update(d)

    train_programs = list(examples.keys())
    print("Finished generation. Total programs: %d" % len(train_programs))

    # Generate test programs (they're not equivalent to all shorter programs
    # so only same length needs to be considered)
    for test_len in test_lens:
        test_programs = []
        test_candidates = [x for x in train_programs
                           if len(x.statements) == test_len]
        train_programs = [x for x in train_programs
                          if len(x.statements) != test_len]
        random.shuffle(test_candidates)
        indices_to_discard = set()
        for i, program in enumerate(test_candidates):
            if len(test_programs) >= args.num_test:
                break
            if i in indices_to_discard:
                continue
            print("\rCreating test programs for length %d... %d\\%d" %
                  (test_len, len(test_programs), args.num_test), end="")
            test_programs.append(program)
            indices_to_discard.add(i)
            # BUG FIX: enumerate over the slice previously yielded j starting
            # at 0, so the recorded indices pointed at the wrong candidates
            # (equivalents were kept, unrelated programs discarded). Offset
            # j so it indexes test_candidates directly.
            for j, other in enumerate(test_candidates[i + 1:], start=i + 1):
                if j in indices_to_discard:
                    continue
                if constraint.is_same(program, other, examples[program]):
                    indices_to_discard.add(j)
        print('')
        print("Removed %d programs" % len(indices_to_discard))

        # Candidates neither sampled nor discarded go back to the train set.
        train_programs += [test_candidates[i]
                           for i in range(len(test_candidates))
                           if i not in indices_to_discard]
        output_path = args.test_output_path + '_' + str(test_len)
        print('Writing %d test programs to %s' %
              (len(test_programs), output_path))
        with open(output_path, 'w') as f:
            write_programs_to_file(f, test_programs, examples)

    print('Writing %d train programs to %s' %
          (len(train_programs), args.train_output_path))
    with open(args.train_output_path, 'w') as f:
        write_programs_to_file(f, train_programs, examples)