def initialize():
     # Don't use the "[...] * 4" operator: it does not deep-copy the pairs,
     # it just repeats four references to the same pair object!
     return [[GridShmProxy.StateProxy.initialize(), mp.Value('d', 0, lock=True)],
             [GridShmProxy.StateProxy.initialize(), mp.Value('d', 0, lock=True)],
             [GridShmProxy.StateProxy.initialize(), mp.Value('d', 0, lock=True)],
             [GridShmProxy.StateProxy.initialize(), mp.Value('d', 0, lock=True)]]
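The comment above is warranted: Python's sequence repetition copies references rather than objects, so "[pair] * 4" would make all four slots alias one pair. A minimal standalone sketch of the difference (the None placeholders stand in for the state proxies and are only for illustration):

import multiprocessing as mp

def aliasing_demo():
    # Wrong: the repetition operator creates four references to ONE pair.
    shared = [[None, mp.Value('d', 0, lock=True)]] * 4
    shared[0][1].value = 7.0
    print(shared[3][1].value)    # 7.0 -- every slot sees the write

    # Right: a comprehension (or the explicit list above) builds independent pairs.
    separate = [[None, mp.Value('d', 0, lock=True)] for _ in range(4)]
    separate[0][1].value = 7.0
    print(separate[3][1].value)  # 0.0 -- unaffected

if __name__ == '__main__':
    aliasing_demo()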
Example No. 2
 def __init__(self,
              domain_factory,
              shm_proxy,
              lambdas=None,
              nb_domains=os.cpu_count(),
              ipc_notify=False):
     super().__init__(domain_factory, lambdas, nb_domains, ipc_notify)
     self._activations = [
         mp.Value('b', False, lock=True) for i in range(nb_domains)
     ]
     self._dones = [
         mp.Value('b', False, lock=True) for i in range(nb_domains)
     ]
     self._conditions = [mp.Condition() for i in range(nb_domains)]
     self._shm_proxy = shm_proxy
     self._shm_registers = {
     }  # Maps from registered method parameter types to vectorized array ranges
     self._shm_types = {}  # Maps from register index to type
     self._shm_sizes = {
     }  # Maps from registered method parameter types to the number of arrays encoding each type
     self._shm_arrays = []  # Methods' vectorized parameters
     self._rsize = 0  # Total size of the register (updated below)
     self._shm_lambdas = [None] * nb_domains  # Vectorized lambdas' ids
     self._shm_names = [None] * nb_domains  # Vectorized methods' names
     self._shm_params = [
         None
     ] * nb_domains  # Indices of methods' vectorized parameters
     for i in range(nb_domains):
         j = 0
         for r in shm_proxy.register():
             for k in range(r[1]):
                 m = shm_proxy.initialize(r[0])
                 if isinstance(m, (list, tuple)):
                     if i == 0 and k == 0:  # do it once for all the domains and redundant initializers
                         self._shm_sizes[r[0].__name__] = len(m)
                         self._shm_registers[r[0].__name__] = (
                             j, j + (r[1] * len(m)))
                         self._shm_types.update({
                             kk: r[0]
                             for kk in range(j, j + (r[1] * len(m)), len(m))
                         })
                         self._rsize += (r[1] * len(m))
                     self._shm_arrays.extend(m)
                     j += len(m)
                 else:
                     if i == 0 and k == 0:  # do it once for all the domains and redundant initializers
                         self._shm_sizes[r[0].__name__] = 1
                         self._shm_registers[r[0].__name__] = (j, j + r[1])
                         self._shm_types.update(
                             {kk: r[0]
                              for kk in range(j, j + r[1])})
                         self._rsize += r[1]
                     self._shm_arrays.append(m)
                     j += 1
         self._shm_lambdas[i] = mp.Value('i', -1, lock=True)
         self._shm_names[i] = mp.Array('c', bytearray(100))
         self._shm_params[i] = mp.Array(
             'i', [-1] * sum(r[1] for r in shm_proxy.register()))
     logger.info(rf'Using {nb_domains} parallel shared memory domains')
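The registration loop above only assumes two things of shm_proxy: register() yields (type, count) pairs, and initialize(type) returns either a single shared value or a list of shared values encoding that type (compare the small initialize() snippets in the later examples). A minimal proxy satisfying that contract might look like the sketch below; the Position type and the counts are made up for illustration:

import multiprocessing as mp

class Position:
    """Hypothetical parameter type encoded as two shared doubles (x, y)."""

class MinimalShmProxy:
    @staticmethod
    def register():
        # Each entry: (parameter type, number of slots of that type to reserve).
        return [(Position, 2), (float, 1)]

    @staticmethod
    def initialize(t):
        if t is Position:
            return [mp.Value('d', 0, lock=True), mp.Value('d', 0, lock=True)]
        return mp.Value('d', 0, lock=True)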
Example No. 3
    def start(self):

        max_queue_size = 1 if self.ordered else self.max_queue_size // 2

        self.queue = multip.Queue(
            maxsize=max_queue_size) if self.multiprocess else Queue.Queue(
                maxsize=self.max_queue_size)

        # Flag used for keeping values in completed queue in order
        self.last_completed_job = multip.Value('i', -1)
        self.exit = multip.Event()

        if self.multiprocess and self.ordered:
            self.cache_queue = Queue.Queue(maxsize=self.max_queue_size)

            def batcher(queue, cache_queue):
                while not self.exit.is_set():
                    job_index, item = queue.get()
                    cache_queue.put((job_index, item))

                    time.sleep(0.0001)  # brief pause to avoid spinning too fast

            # multiprocessing Queues do NOT guarantee first-in-first-out ordering across processes,
            # so we use a separate thread to put the items back in order synchronously.
            p = Thread(target=batcher,
                       args=(self.queue, self.cache_queue),
                       name='Synchronous batcher worker')
            p.daemon = True
            p.start()

        else:
            self.cache_queue = self.queue

        # Start worker processes or threads
        for i in xrange(self.n_producers):
            name = "ContinuousParallelBatchIterator worker {0}".format(i)

            if self.multiprocess:
                p = multip.Process(target=_produce_helper,
                                   args=(i, self.generator, self.job_queue,
                                         self.queue, self.last_completed_job,
                                         self.ordered, self.exit),
                                   name=name)
            else:
                p = Thread(target=_produce_helper,
                           args=(i, self.generator, self.job_queue, self.queue,
                                 self.last_completed_job, self.ordered,
                                 self.exit),
                           name=name)

            # Make the process daemon, so the main process can die without these finishing
            p.daemon = True
            p.start()

        self.started = True
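The batcher thread above only drains the multiprocessing queue; the actual ordering bookkeeping lives in _produce_helper, which is not shown here. As a rough, self-contained illustration of the underlying idea (drain into a process-local queue, then reorder by job index on the consumer side), assuming nothing about the real worker:

import multiprocessing as mp
import queue
import threading

def _worker(job_index, out_q):
    # Hypothetical worker: results may arrive out of order.
    out_q.put((job_index, job_index * job_index))

def drain_and_reorder(n_jobs=8):
    mp_q = mp.Queue()
    local_q = queue.Queue()

    # Daemon thread that moves items from the multiprocessing queue
    # into a process-local queue, mirroring the batcher above.
    def batcher():
        for _ in range(n_jobs):
            local_q.put(mp_q.get())

    threading.Thread(target=batcher, daemon=True).start()

    procs = [mp.Process(target=_worker, args=(i, mp_q)) for i in range(n_jobs)]
    for p in procs:
        p.start()

    # Reorder on the consumer side by buffering out-of-order results.
    pending, next_index = {}, 0
    while next_index < n_jobs:
        job_index, result = local_q.get()
        pending[job_index] = result
        while next_index in pending:
            print(next_index, pending.pop(next_index))
            next_index += 1

    for p in procs:
        p.join()

if __name__ == '__main__':
    drain_and_reorder()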
Example No. 4
def gen_programs(program_len, num_programs, args):
    """
    Generates the specified number of programs of the given length. These are the exact steps performed:
    1. Generate <num_programs> programs using gen_program_worker in a process pool
    2. Generate examples for each program by executing gen_examples_worker in a process pool.
       Discard programs for which the required number of examples could not be generated.
    3. Return a dictionary of the form {program: examples}
    """
    progress_counter = multiprocessing.Value('i', 0)
    gen_prog_pool = multiprocessing.Pool(processes=args.num_workers,
                                         initializer=init_gen_prog_worker,
                                         initargs=(progress_counter,
                                                   num_programs, program_len))

    input_type_combinations = get_input_type_combinations(params.num_inputs)
    programs = gen_prog_pool.map(gen_program_worker, input_type_combinations)
    print('')

    # Flatten
    programs = [item for sublist in programs for item in sublist]
    programs = list(set(programs))

    # Generate examples and filter out null programs
    progress_counter.value = 0
    valid_counter = multiprocessing.Value('i', len(programs))
    gen_examples_pool = multiprocessing.Pool(
        processes=args.num_workers,
        initializer=init_gen_examples_worker,
        initargs=(progress_counter, valid_counter, len(programs),
                  args.num_examples, args.num_example_tries))

    res = gen_examples_pool.map(gen_examples_worker, programs)
    print('')
    examples = dict(zip(programs, res))
    examples = {k: v for k, v in examples.items() if v}
    return examples
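The pools above share progress via multiprocessing.Value objects handed to the workers through initializer/initargs, since synchronized objects cannot be passed to pool workers as ordinary map arguments. A stripped-down sketch of that pattern (the task body is an assumption, not the project's gen_examples_worker):

import multiprocessing

_progress = None

def _init_worker(progress_counter):
    # Runs once in every worker process; stores the shared counter globally.
    global _progress
    _progress = progress_counter

def _do_task(x):
    with _progress.get_lock():
        _progress.value += 1
    return x * 2

if __name__ == '__main__':
    progress = multiprocessing.Value('i', 0)
    with multiprocessing.Pool(processes=4,
                              initializer=_init_worker,
                              initargs=(progress,)) as pool:
        results = pool.map(_do_task, range(10))
    print(results, progress.value)  # progress.value == 10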
Example No. 5
 def __init__(
     self, domain_factory, lambdas=None, nb_domains=os.cpu_count(), ipc_notify=False
 ):
     self._domain_factory = domain_factory
     self._lambdas = lambdas
     self._active_domains = mp.Array(
         "b", [False for i in range(nb_domains)], lock=True
     )
     self._initializations = [
         mp.Value("b", False, lock=True) for i in range(nb_domains)
     ]
     self._conditions = [mp.Condition() for i in range(nb_domains)]
     self._temp_connections = [None] * nb_domains
     self._ipc_connections = [None] * nb_domains
     self._processes = [None] * nb_domains
     self._ipc_notify = ipc_notify
Example No. 6
 def initialize():
     return mp.Value('d', False)
Example No. 7
 def initialize():
     return [mp.Value('d', 0), mp.Value('b', False)]
Example No. 8
 def initialize():
     return mp.Value('I', 0, lock=True)
Example No. 9
 def initialize():
     return mp.Value("i", False)
Example No. 10
 def initialize():
     return [mp.Value("d", 0), mp.Value("b", False)]
Example No. 11
def main():
    """
    Generates programs. These are the basic steps performed:

    D = {}
    for 1 <= i <= max_train_len:
       1. P = Generate programs of length i
       2. E = Generate examples for the generated programs
       3. Discard programs in P that are equivalent to any program in D
       4. D += (P, E)

    for j in test_lengths:
      Sample num_test programs
      Discard all programs of equal length in D which are equivalent.

    Note:
        1. Step 3 of the first loop greatly increases the richness of the dataset. This way we ensure
           that our programs are unlikely to have shorter equivalents.
        2. It is recommended to use --cache to load a dataset cache. The algorithm then continues generating
           for lengths larger than the maximum length of the cache. This allows incremental dataset generation and
           also helps with the generation of shorter programs where generation is slow due to randomness. Furthermore,
           we can (and should!) have virtually all programs of length <=3, to ensure our dataset is meaningful.
        3. During test sampling we only compare to programs of equal length, for efficiency. This is
           because our data generation algorithm already ensures that no longer or shorter program is
           equivalent.
        4. Since the pruning is done after program generation, rather than during it, the number of programs
           generated in each iteration is NOT args.num_train. This is purely an implementation detail: it is
           challenging to discard while generating, since it would require all processes to read from and
           write to the same dictionary in parallel. Pruning during generation would be a good improvement
           for the future, to avoid having to tune num_train by trial and error.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--num_train', type=int, required=True)
    parser.add_argument('--num_test', type=int, required=True)
    parser.add_argument('--train_output_path', type=str, required=True)
    parser.add_argument('--test_output_path', type=str, required=True)
    parser.add_argument('--max_train_len', type=int, required=True)
    parser.add_argument('--min_train_len', default=5, type=int,
                        required=False)
    parser.add_argument('--test_lengths',
                        type=str,
                        required=True,
                        help="List of test lengths to generate")
    parser.add_argument('--num_workers', type=int, default=8)
    parser.add_argument('--num_examples',
                        type=int,
                        default=params.num_examples)
    parser.add_argument(
        '--num_example_tries',
        type=int,
        default=200,
        help='total number of attempts at generating examples for each program')
    parser.add_argument(
        '--cache',
        type=str,
        default=None,
        help="Dataset cache from which to continue generating programs")
    args = parser.parse_args()

    test_lens = set([int(x) for x in args.test_lengths.split()])

    if args.min_train_len != -1:
        examples = {}
        min_len = args.min_train_len - 1  # the loop below starts at min_len + 1
    else:
        if args.cache:
            examples = load_cache(args.cache)
            min_len = max([len(k) for k in examples])
        else:
            examples = {}
            min_len = 0

    for program_len in range(min_len + 1, args.max_train_len + 1):
        num_programs = args.num_train + args.num_test
        if program_len in KNOWN_TRAIN_SIZES:
            num_programs = min(num_programs, KNOWN_TRAIN_SIZES[program_len])

        print("Generating programs of length %d (current dataset size: %d)" %
              (program_len, len(examples)))
        new_examples = gen_programs(program_len, num_programs, args)

        existing_programs = list(examples.keys())
        counter = multiprocessing.Value('i', 0)
        new_programs = list(new_examples.keys())
        discard_pool = multiprocessing.Pool(
            processes=args.num_workers,
            initializer=init_discard_identical_worker,
            initargs=(existing_programs, counter, len(new_programs)))
        new_program_parts = [
            new_programs[i::args.num_workers] for i in range(args.num_workers)
        ]

        new_example_parts = [{p: new_examples[p]
                              for p in programs}
                             for programs in new_program_parts]
        res = discard_pool.map(discard_identical_worker, new_example_parts)
        print('')
        for d in res:
            examples.update(d)

    train_programs = list(examples.keys())
    print("Finished generation. Total programs: %d" % len(train_programs))

    # Generate test programs (equivalence with shorter and longer programs is already ruled out, so only the same length needs to be considered)
    for test_len in test_lens:
        test_programs = []
        test_candidates = [
            x for x in train_programs if len(x.statements) == test_len
        ]
        train_programs = [
            x for x in train_programs if len(x.statements) != test_len
        ]

        random.shuffle(test_candidates)
        indices_to_discard = set()
        for i, program in enumerate(test_candidates):
            if len(test_programs) >= args.num_test:
                break
            if i in indices_to_discard:
                continue

            print("\rCreating test programs for length %d... %d\\%d" %
                  (test_len, len(test_programs), args.num_test),
                  end="")

            test_programs.append(program)
            indices_to_discard.add(i)

            # Enumerate from i + 1 so j indexes into test_candidates directly.
            for j, other in enumerate(test_candidates[i + 1:], start=i + 1):
                if j in indices_to_discard:
                    continue
                if constraint.is_same(program, other, examples[program]):
                    indices_to_discard.add(j)
        print('')

        print("Removed %d programs" % len(indices_to_discard))
        train_programs += [
            test_candidates[i] for i in range(len(test_candidates))
            if i not in indices_to_discard
        ]

        output_path = args.test_output_path + '_' + str(test_len)
        print('Writing %d test programs to %s' %
              (len(test_programs), output_path))
        with open(output_path, 'w') as f:
            write_programs_to_file(f, test_programs, examples)

    print('Writing %d train programs to %s' %
          (len(train_programs), args.train_output_path))
    with open(args.train_output_path, 'w') as f:
        write_programs_to_file(f, train_programs, examples)
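For reference, the pruning above delegates the actual equivalence test to constraint.is_same, whose implementation is not shown here. A plausible behavioral check would compare the two programs' outputs on the shared example set; the sketch below is only an assumption of that idea, with program.run(...) as a hypothetical interface:

def behaviorally_equivalent(program_a, program_b, examples):
    # Treat two programs as equivalent if they agree on every shared example.
    # program.run(inputs) and the (inputs, output) pair layout are assumptions,
    # not the project's actual API.
    return all(program_a.run(inputs) == program_b.run(inputs)
               for inputs, _ in examples)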