def _instances(self, file_path: str, manager: Manager, output_queue: Queue) -> Iterator[Instance]:
        """
        A generator that reads instances off the output queue and yields them up
        until none are left (signified by all ``num_workers`` workers putting their
        ids into the queue).
        """
        shards = glob.glob(file_path)
        num_shards = len(shards)

        # If we want multiple epochs per read, put shards in the queue multiple times.
        input_queue = manager.Queue(num_shards * self.epochs_per_read + self.num_workers)
        for _ in range(self.epochs_per_read):
            random.shuffle(shards)
            for shard in shards:
                input_queue.put(shard)

        # Then put a None per worker to signify no more files.
        for _ in range(self.num_workers):
            input_queue.put(None)

        processes: List[Process] = []
        num_finished = 0

        for worker_id in range(self.num_workers):
            process = Process(target=_worker,
                              args=(self.reader, input_queue, output_queue, worker_id))
            logger.info(f"starting worker {worker_id}")
            process.start()
            processes.append(process)

        # Keep going as long as not all the workers have finished.
        while num_finished < self.num_workers:
            item = output_queue.get()
            if isinstance(item, int):
                # Means a worker has finished, so increment the finished count.
                num_finished += 1
                logger.info(f"worker {item} finished ({num_finished}/{self.num_workers})")
            else:
                # Otherwise it's an ``Instance``, so yield it up.
                yield item

        for process in processes:
            process.join()
        processes.clear()
    def dist_train(self):
        gpu_ids_avail = GPUtil.getAvailable(maxMemory=0.02, limit=8)
        shuffle(gpu_ids_avail)
        gpu_ids = gpu_ids_avail[:self.world_size]
        assert len(gpu_ids) == self.world_size, "not enough GPUs"
        processes = []
        for rank, gpu_id in enumerate(gpu_ids):
            p = Process(target=self._dist_train, args=(rank, gpu_id))
            p.start()
            print(f"process {rank} has started")
            processes.append(p)

        for p in processes:
            p.join()
def ansyc_multiple_process_train(args, save_dir):
    q = Queue(10)
    data_lists = [build_datasets(args) for _ in range(args.gpus)]

    p_producer = Process(target=data_producers, args=(args, q))
    p_consumers = [
        Process(target=data_consumers,
                args=(args, q, save_dir, i, data_lists[i]))
        for i in range(args.gpus)
    ]

    p_producer.start()
    for p in p_consumers:
        p.start()

    p_producer.join()
    for p in p_consumers:
        p.join()
Example #4
    def __init__(self, env_fns, spaces=None):
        """
        envs: list of gym environments to run in subprocesses
        """
        self.waiting = False
        self.closed = False
        nenvs = len(env_fns)
        self.nenvs = nenvs
        self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
        self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
            for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
        for p in self.ps:
            p.daemon = True # if the main process crashes, we should not cause things to hang
            p.start()
        for remote in self.work_remotes:
            remote.close()

        self.remotes[0].send(('get_spaces', None))
        observation_space, action_space = self.remotes[0].recv()
        VecEnv.__init__(self, len(env_fns), observation_space, action_space)
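
Each environment lives in its own subprocess connected by a Pipe, and the parent drives it with (command, data) tuples such as ('get_spaces', None). A stripped-down sketch of that command/response pattern with a toy worker (not the gym worker used above):

from multiprocessing import Pipe, Process

def env_worker(conn):
    state = 0
    while True:
        cmd, data = conn.recv()
        if cmd == 'step':
            state += data
            conn.send(state)
        elif cmd == 'close':
            conn.close()
            break

if __name__ == "__main__":
    parent_conn, child_conn = Pipe()
    p = Process(target=env_worker, args=(child_conn,), daemon=True)
    p.start()
    parent_conn.send(('step', 5))
    print(parent_conn.recv())  # -> 5
    parent_conn.send(('close', None))
    p.join()
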
def main():
    print('Starting')
    parser = argparse.ArgumentParser()
    # Configurable hyperparameters
    parser.add_argument('--rows', type=int, default=1,
                        help='Number of rows in the tensor.')
    parser.add_argument('--columns', type=int, default=1,
                        help='Number of columns in the tensor.')
    parser.add_argument('--backend', type=str, default=None,
                        help='backend for distributed operations.')

    # Container environment
    parser.add_argument('--hosts', type=list, default=json.loads(os.environ["SM_HOSTS"]))
    parser.add_argument('--current-host', type=str, default=os.environ["SM_CURRENT_HOST"])
    parser.add_argument('--model-dir', type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument('--num-gpus', type=int, default=os.environ["SM_NUM_GPUS"])
    parser.add_argument('--num-cpus', type=int, default=os.environ["SM_NUM_CPUS"])

    args = parser.parse_args()

    number_of_processes = args.num_gpus if args.num_gpus > 0 else args.num_cpus
    world_size = number_of_processes * len(args.hosts)
    logger.info('Running \'{}\' backend on {} nodes and {} processes. World size is {}.'.format(
        args.backend, len(args.hosts), number_of_processes, world_size
    ))
    host_rank = args.hosts.index(args.current_host)
    master_addr = args.hosts[0]
    master_port = '55555'
    processes = []
    for rank in range(number_of_processes):
        process_rank = host_rank * number_of_processes + rank
        p = Process(
            target=init_processes,
            args=(args.backend,
                  master_addr,
                  master_port,
                  process_rank,
                  world_size,
                  args.rows,
                  args.columns,
                  args.current_host)
        )
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    save('success', args.model_dir)
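
The global-rank arithmetic (host_rank * number_of_processes + rank) gives each host a contiguous block of ranks. A worked example with hypothetical values, two hosts and four processes each:

hosts = ['algo-1', 'algo-2']                     # stand-in for SM_HOSTS
number_of_processes = 4                          # e.g. four GPUs per host
world_size = number_of_processes * len(hosts)    # 8
for host_rank, host in enumerate(hosts):
    ranks = [host_rank * number_of_processes + r for r in range(number_of_processes)]
    print(host, ranks)   # algo-1 [0, 1, 2, 3], then algo-2 [4, 5, 6, 7]
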
Example #6
    def _generate_parallel(self, iteration, network, device, num_workers):
        q, r = divmod(self.remaining_games, num_workers)
        num_active_workers = Value('i', num_workers)
        resign_threshold = Value('d', self.resign_mgr.threshold())
        evaluator_mgr = BulkEvaluatorManager([network], device, num_workers)
        output_queue = SimpleQueue()

        # start the workers
        workers = []
        for worker_id in range(num_workers):
            num_games = q + 1 if worker_id < r else q
            evaluator = evaluator_mgr.get_evaluator(worker_id, 0)
            worker = Process(
                target=self._worker_job,
                args=(worker_id, num_games, num_active_workers,
                      resign_threshold, evaluator, output_queue),
            )
            workers.append(worker)
            worker.start()

        # start evaluator server
        server = evaluator_mgr.get_server(num_active_workers)
        server.start()

        # collect the examples generated by workers
        while num_active_workers.value > 0 or not output_queue.empty():
            examples, resign_value_history, result = output_queue.get()
            self.example_pool += examples
            self.game_length.append(len(examples))

            # add the history into resignation manager to update the threshold
            if resign_value_history is not None:
                self.resign_mgr.add(resign_value_history, result)
                resign_threshold.value = self.resign_mgr.threshold()

            self.remaining_games -= 1

            # periodically save the progress
            if (self.conf.GAMES_PER_ITERATION - self.remaining_games) \
                    % self.conf.EXAMPLE_POOL_SAVE_FREQUENCY == 0:
                self.save(iteration)
                log.info(
                    f'[iter={iteration}] ExamplePool: checkpoint saved, '
                    f'{self.remaining_games} games remaining'
                )

        for worker in workers:
            worker.join()
        server.join()
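
num_active_workers and resign_threshold are multiprocessing.Value objects: single values backed by shared memory that both the workers and the parent can read and update. A minimal illustration of that primitive (names are illustrative, not from the original code):

from multiprocessing import Process, Value

def play_games(num_games, games_done):
    for _ in range(num_games):
        with games_done.get_lock():   # += on a Value is not atomic by itself
            games_done.value += 1

if __name__ == "__main__":
    games_done = Value('i', 0)
    workers = [Process(target=play_games, args=(5, games_done)) for _ in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    print(games_done.value)  # -> 20
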
Example #8
    def __init__(self, n_workers, actor, args):
        self._now_episode = Value('i', 0)

        self.queue = Queue()
        self.collect_event = Event()

        self.worker = []
        for i in range(n_workers):
            self.worker.append(
                Worker(self.queue, self.collect_event, actor, args))
            time.sleep(1)

        self.process = [
            Process(target=self.worker[i].run, args=(self._now_episode, ))
            for i in range(n_workers)
        ]

        for p in self.process:
            p.start()
        print(f'Start {n_workers} workers.')
Example #9
def run(args):

    # setup
    N, D = args.N, args.D
    n_examples = args.n_train_examples
    n_threads = args.n_threads
    n_examples_per_thread = n_examples // n_threads

    # create policy network
    policy_network = alphatsp.util.get_policy_network(args.policy_network)

    # generate examples
    print("Generating examples and training...")

    manager = Manager()
    train_queue = manager.Queue()
    shared_dict = manager.dict()

    shared_dict["success"] = False

    producers = []
    for _ in range(n_threads):
        producers.append(
            Process(target=generate_examples,
                    args=(n_examples_per_thread, train_queue, args)))

    for p in producers:
        p.start()

    c = Process(target=train,
                args=(policy_network, train_queue, shared_dict, args))
    c.start()

    for p in producers:
        p.join()
    train_queue.put(None)

    c.join()

    status = shared_dict["success"]
    if not status:
        print("Experiment failed.")
        return -1
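
shared_dict is a Manager dict, a proxy object that lets the training process report its status back to the parent. A minimal sketch of that mechanism (illustrative names, not the alphatsp code):

from multiprocessing import Manager, Process

def trainer(shared_dict):
    shared_dict["success"] = True  # write through the proxy; the parent sees it

if __name__ == "__main__":
    manager = Manager()
    shared_dict = manager.dict()
    shared_dict["success"] = False
    p = Process(target=trainer, args=(shared_dict,))
    p.start()
    p.join()
    print(shared_dict["success"])  # -> True
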
    def _call_with_instances(self,
                             instances: Iterable[Instance],
                             num_epochs: int,
                             shuffle: bool) -> Iterator[TensorDict]:
        # JoinableQueue needed here as sharing tensors across processes
        # requires that the creating process not exit prematurely.
        output_queue = JoinableQueue(self.output_queue_size)
        input_queue = Queue(self.output_queue_size * self.batch_size)

        # Start process that populates the queue.
        self.queuer = Process(target=_queuer,
                              args=(instances, input_queue, self.num_workers, num_epochs))
        self.queuer.start()

        # Start the tensor-dict workers.
        for i in range(self.num_workers):
            args = (input_queue, output_queue, self.iterator, shuffle, i)
            process = Process(target=_create_tensor_dicts_from_queue, args=args)
            process.start()
            self.processes.append(process)

        num_finished = 0
        while num_finished < self.num_workers:
            item = output_queue.get()
            output_queue.task_done()
            if isinstance(item, int):
                num_finished += 1
                logger.info(f"worker {item} finished ({num_finished} / {self.num_workers})")
            else:
                yield item

        for process in self.processes:
            process.join()
        self.processes.clear()

        if self.queuer is not None:
            self.queuer.join()
            self.queuer = None
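
The JoinableQueue above adds task_done()/join() bookkeeping on top of a normal queue, so the producing side can wait until every item it put has actually been consumed. A minimal sketch of those semantics (not the AllenNLP worker):

from multiprocessing import JoinableQueue, Process

def consumer(q):
    while True:
        item = q.get()
        q.task_done()      # acknowledge every get, including the sentinel
        if item is None:
            break

if __name__ == "__main__":
    q = JoinableQueue()
    p = Process(target=consumer, args=(q,))
    p.start()
    for i in range(3):
        q.put(i)
    q.put(None)
    q.join()               # blocks until task_done() was called for every put
    p.join()
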
Example #11
def run_in_process_group(world_size, fn, input):
    assert not dist.is_initialized()
    processes = []
    q = Queue()
    for rank in range(world_size - 1):
        p = Process(target=init_process, args=(rank, world_size, fn, input, q))
        p.start()
        processes.append(p)

    if world_size >= 1:
        # run 1 process in current unittest process for debug purpose
        init_process(world_size - 1, world_size, fn, input, q)

    for p in processes:
        p.join()
    return q
    def train(self, data_loaders, tb=None, num_updates=5, num_iters=250000):
        data_queue = Queue()
        # for notifying when to receive data
        data_event = Event()
        # for notifying this method when to send new data
        process_event = Event()
        # set so the first iteration doesn't hang
        process_event.set()
        num_tasks = len(data_loaders)

        processes = []
        for process_id in range(self.world_size):
            processes.append(
                Process(target=self.init_process,
                        args=(process_id, data_queue, data_event,
                              process_event, num_updates,
                              tb if process_id == 0 else None)))
            processes[-1].start()

        for num_iter in range(num_iters):
            print("at the top of iter loop %d" % num_iter)
            process_event.wait()
            process_event.clear()
            tasks = np.random.randint(0, num_tasks, (self.world_size))
            for task in tasks:
                # placeholder for sampling data from the dataset
                task_data = next(data_loaders[task])
                data_queue.put(
                    (task_data[0].numpy()[0], task_data[1].numpy()[0],
                     task_data[2].numpy()[0], task_data[3].numpy()[0]))

            data_event.set()

        new_model = self.meta_learners[0].model.original_state_dict

        for p in processes:
            p.terminate()
            p.join()
Example #13
def test_mem_share(share_memory):
    # `q` below is assumed to be a module-level torch.multiprocessing Queue
    # shared with `torch_shared_mem_process`; it is not defined in this snippet.
    p = Process(target=torch_shared_mem_process, args=(share_memory, ))
    p.start()

    start = time.time()
    n = 100
    for i in range(n):
        data = torch.zeros([5, 1280, 720, 3], dtype=torch.float, pin_memory=True)
        if share_memory:
            data.share_memory_()
            q.put(data)
        else:
            q.put(data.numpy())
    q.put(None)
    p.join()
    return time.time() - start
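
share_memory_() moves a tensor's storage into shared memory so a child process receives a view of the same buffer rather than a copy. A small, self-contained sketch of that behaviour with torch.multiprocessing (separate from the benchmark above):

import torch
import torch.multiprocessing as mp

def reader(queue):
    t = queue.get()
    t += 1  # in-place update is visible to the parent through shared memory

if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)
    tensor = torch.zeros(4)
    tensor.share_memory_()          # storage now lives in shared memory
    queue = mp.Queue()
    p = mp.Process(target=reader, args=(queue,))
    p.start()
    queue.put(tensor)
    p.join()
    print(tensor)                   # reflects the child's in-place update
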
Example #14
def subprocess_prefetch(generator: Iterable[Union[np.ndarray, Iterable[np.ndarray]]],
                        prefetch_buffer_size: int,
                        ) -> Iterable[Union[np.ndarray, Iterable[np.ndarray]]]:
    """
    Wraps a generator to prefetch batches in a separate subprocess. It can
    be used in a `with` block (which ensures proper resource cleanup) or
    directly as a normal generator. It relies on the ability of
    torch.multiprocessing to load Tensors in shared memory; this way,
    the subprocess loads the numpy array from disk, creates a torch Tensor
    from it and then sends it through a Queue to the main process, which
    consumes it.

    :param generator: Generator to wrap.
    :param prefetch_buffer_size: Size of the prefetch buffer.
    :return: Wrapped generator.
    """
    batch_queue = Queue(prefetch_buffer_size)
    control_queue = Queue()
    Process(target=_enqueue_loader_output,
            args=(batch_queue, control_queue, generator)).start()
    control_queue.put(True)
    return _BatchIterator(batch_queue, control_queue)
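
Hypothetical usage of the wrapper above, assuming a fork-based start method (so the generator itself never needs to be pickled) and a loader that yields numpy batches:

import numpy as np

def batches():
    for _ in range(100):
        yield np.random.rand(32, 3, 32, 32).astype(np.float32)

for batch in subprocess_prefetch(batches(), prefetch_buffer_size=4):
    ...  # training consumes `batch` while the subprocess prefetches the next one
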
Example #15
    def __iter__(self):
        print('Starting processes')
        random.seed(0)
        random.shuffle(self.filepaths)
        filepaths = deque()
        for path in self.filepaths:
            filepaths.append(path)
        self.buffr_processes = []
        args = (self.filepaths, self.buffer, self.partial)
        for i in range(10):
            process = Process(target=fill_buffer, args=args)
            process.daemon = True
            process.start()
            self.buffr_processes.append(process)

        args = (self.buffer, self.batch_queue, self.batch_size)
        self.batch_process = Process(target=fill_batch, args=args)
        self.batch_process.daemon = True
        self.batch_process.start()
        return self
Example #16
def trace(size: int, model: nn.Module, data_partitions: DataPartitioner,
          criterion: Callable, ip: str):
    processes = []
    queue = Queue()
    for rank in range(size):
        p = Process(target=init_process,
                    args=(rank, size, model, data_partitions.use(rank),
                          criterion, queue, trace_, ip))
        p.start()
        processes.append(p)

    trace = queue.get()

    for p in processes:
        p.join()

    return trace
Example #17
def main(args):
    args.embedding_output_path = join(args.EMBED_DATA_DIR, args.output_path) 
    print('> START  ')
    print('> parameter  ')
    for k, v in args._get_kwargs():
        print('> {} : {}'.format(k, v))
    print('')
    print('> Action ')
    pool_path = args.data_path
    GPU_NUM = args.gpu_num
    embedding_type_name = args.embedding_type_name
    output_embedding_data = args.embedding_output_path 

    sym_line_list = [line.strip() for line in open(pool_path, mode='r', encoding='utf-8')]

    number_of_processes = GPU_NUM if GPU_NUM < len(sym_line_list) else len(sym_line_list)
    num_of_tasks = len(sym_line_list) // number_of_processes

    tasks = [sym_line_list[i * num_of_tasks:(i + 1) * num_of_tasks] for i in range(number_of_processes)]
    
    
    tasks_to_accomplish = Manager().Queue()
    for task in tasks:
        tasks_to_accomplish.put(task)
    tasks_finished = Manager().Queue()
    processes = []
    # creating processes
    for i in range(number_of_processes):
        p = Process(target=make_embedding, args=(tasks_to_accomplish, tasks_finished, i, embedding_type_name))
        processes.append(p)
        p.start()
    store_target = []
    
    for p in processes:
        p.join()

    while not tasks_finished.empty():
        store_target.append(tasks_finished.get_nowait())
    
    with open(output_embedding_data, 'wb') as f:
        pickle.dump(store_target, f, pickle.HIGHEST_PROTOCOL)

    return True
def ansyc_multiple_process_train(args, save_dir):
    q = Queue(10)
    metann_params = meta_neuralnet_params(args.search_space)
    data_lists = [
        build_datasets(metann_params['search_space']) for _ in range(args.gpus)
    ]

    p_producer = Process(target=data_producers, args=(args, q))
    p_consumers = [
        Process(target=data_consumers,
                args=(args, q, save_dir, i, data_lists[i]))
        for i in range(args.gpus)
    ]

    p_producer.start()
    for p in p_consumers:
        p.start()

    p_producer.join()
    for p in p_consumers:
        p.join()
	def __init__(self, args):
		self.args = args

		######### Load the trained model ########

		self.test_agent = TestAgent(self.args, 991)

		self.test_bucket = self.test_agent.rollout_actor

		######### TEST WORKERS ############
		self.test_task_pipes = Pipe()
		self.test_result_pipes = Pipe()
		self.test_workers = [Process(target=test_rollout_worker,
		                             args=(self.args, 0, 'test', self.test_task_pipes[1], self.test_result_pipes[0],
		                                   None, self.test_bucket, False,
		                                   RANDOM_BASELINE))]  # test_bucket is the neural network for evo
		for worker in self.test_workers: worker.start()

		#### STATS AND TRACKING WHICH ROLLOUT IS DONE ######
		self.best_score = -999;
		self.total_frames = 0;
		self.gen_frames = 0;
		self.test_trace = []
Example #20
def experiment_join(experiment_name, your_code, skip_tests=False):
    """Join an experiment, passing a function for your experiment code"""
    if not skip_tests:
        experiment_is_enabled = get_global(
            f"experiment:{experiment_name}:enabled", type=bool)
        if not experiment_is_enabled:
            raise RuntimeError(
                "It is too early to run this experiment. Please wait for a bit."
            )

        experiment_is_running = get_global(
            f"experiment:{experiment_name}:running", type=bool)
        if experiment_is_running:
            raise RuntimeError(
                "This experiment is already running. You are too late to join. Sorry."
            )

    # Register ourself
    redis.zadd(f"experiment:{experiment_name}:participants", {WORKER_ID: 1})

    def fn():
        print("Waiting for more workers to join. Thanks for your patience ...")

        # Wait until we receive a rank from the master node
        rank = receive(f"experiment:{experiment_name}:rank",
                       blocking=True,
                       type=int)
        world_size = receive(f"experiment:{experiment_name}:world_size",
                             blocking=True,
                             type=int)

        pytorch_distributed_init(experiment_name, rank, world_size)
        print(
            f"Connected to {world_size} workers. You are worker number {rank}. Starting ..."
        )

        start_time = time()
        your_code()
        duration = time() - start_time

        print(f"Execution finished in {duration:.1f}s.")

    p = Process(target=fn)
    p.start()
    p.join()
Example #21
def density(size: int, model: nn.Module, data_partitions: DataPartitioner,
            criterion: Callable, ip: str):
    processes = []
    queue = Queue()
    for rank in range(size):
        p = Process(target=init_process,
                    args=(rank, size, model, data_partitions.use(rank),
                          criterion, queue, density_, ip))
        p.start()
        processes.append(p)

    eigen_list_full = queue.get()
    weight_list_full = queue.get()

    for p in processes:
        p.join()

    return eigen_list_full, weight_list_full
def collectGameDataParallel(network, useNetwork, T, width, height):
    totalGames = 0
    game_images = []
    game_targets = []
    while totalGames < 80:
        images = Queue()
        targets = Queue()
        ngames = 5
        barrier = Barrier(ngames + 1)

        processes = [
            Process(target=collectGameData,
                    args=(barrier, play_game, network, useNetwork,
                          T, width, height, images, targets))
            for _ in range(ngames)
        ]
        for p in processes:
            p.start()

        for _ in range(ngames):
            im = images.get()
            game_images.append(copy.deepcopy(im))
            del im
            t = targets.get()
            game_targets.append(copy.deepcopy(t))
            del t
        barrier.wait()

        for p in processes:
            p.join()
        totalGames += ngames
    flattened_images = list(itertools.chain.from_iterable(game_images))
    flattened_targets = list(itertools.chain.from_iterable(game_targets))
    batchSize = min(len(flattened_images), 2048)
    sample_indices = numpy.random.choice(range(len(flattened_images)),
                                         batchSize)
    sample_images = [flattened_images[i] for i in sample_indices]
    sample_targets = [flattened_targets[i] for i in sample_indices]

    return sample_images, sample_targets
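
The Barrier above is created with ngames + 1 parties: the game workers plus the main process, which only passes barrier.wait() once every worker has arrived. A minimal sketch of that synchronisation (illustrative worker, not collectGameData):

import time
from multiprocessing import Barrier, Process

def worker(barrier, i):
    time.sleep(0.1 * i)   # simulate games finishing at different times
    barrier.wait()        # released only when all parties have arrived

if __name__ == "__main__":
    n = 3
    barrier = Barrier(n + 1)  # n workers plus the main process
    ps = [Process(target=worker, args=(barrier, i)) for i in range(n)]
    for p in ps:
        p.start()
    barrier.wait()            # the main process is the final party
    for p in ps:
        p.join()
    print("all workers passed the barrier")
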
    def test_step(self):
        def _run(rank, world_size):
            model = nn.Linear(10, 1)
            optimizer = DistributedAdamW(model.parameters())

            optimizer.zero_grad()
            loss = model(torch.ones(10).float())
            loss.backward()
            optimizer.step()

        processes = []
        world_size = 4
        for rank in range(world_size):
            p = Process(target=init_processes, args=(rank, world_size, _run))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
Example #24
    def __init__(self, fn, n_processes=4, max_size=200, batchsize=200):
        # Consumer process: keeps the queue topped up with pre-generated batches.
        def consumer(Q):
            iterator = get_supervised_batchsize(fn, batchsize=batchsize)  # todo
            while True:
                try:
                    # only generate more data when the queue has room
                    size = Q.qsize()
                    if size < max_size:
                        # process the data
                        ret = next(iterator)
                        Q.put(ret)
                    else:
                        time.sleep(2)
                except ValueError as e:
                    print(
                        "I think you closed the thing while it was running, but that's okay"
                    )
                    break
                except Exception as e:
                    print("error!", e)
                    break

        self.Q = Queue()
        print("started queue ...")

        # instantiate workers
        self.workers = [
            Process(target=consumer, args=(self.Q, ))
            for i in range(n_processes)
        ]

        for w in self.workers:
            w.start()
        print("started parallel workers, ready to work!")
Example #25
def create_training_processes(training_jobs, createNewEnvironment,
                              checkpoint_at_iterations, agent_queue,
                              results_path, seed):
    """
    :param training_jobs: Array of TrainingJob namedtuples containing a training-scheme, algorithm and name
    :param createNewEnvironment: OpenAI gym environment creation function
    :param checkpoint_at_iterations: array containing the episodes at which the agents will be cloned for benchmarking against one another
    :param agent_queue: queue shared among processes to submit agents that will be benchmarked
    :returns: array of process handlers, needed to join processes at the end of experiment computation
    """
    episodic_reward_directory = f'{results_path}/episodic_rewards'
    if not os.path.exists(episodic_reward_directory):
        os.mkdir(episodic_reward_directory)

    logger = logging.getLogger('CreateTrainingProcesses')
    logger.setLevel(logging.DEBUG)
    logger.addHandler(
        logging.handlers.SocketHandler(
            host='localhost', port=logging.handlers.DEFAULT_TCP_LOGGING_PORT))
    logger.info('Training {} jobs: [{}]. '.format(
        len(training_jobs), ', '.join(map(lambda job: job.name,
                                          training_jobs))))

    menagerie_path = f'{results_path}/menageries'
    if not os.path.exists(menagerie_path):
        os.mkdir(menagerie_path)

    ps = []
    for job in training_jobs:
        p = Process(target=training_process,
                    args=(createNewEnvironment(), job.agent,
                          job.training_scheme, checkpoint_at_iterations,
                          agent_queue, job.name, results_path, seed))
        ps.append(p)
    logger.info("All training jobs submitted")
    return ps
Example #26
    def run_parallel_episodes(self, max_episodes, max_steps):
        for episode in range(max_episodes):
            print("Episode : {}".format(episode + 1))
            agent_list = []
            for i in range(self.num_agents):
                agent_list.append(np.random.randint(C.MIN_NODES, C.MAX_NODES))

            arr = Array('i', agent_list)
            m = Manager()
            printlock = m.Lock()
            synchlock = m.Lock()
            all_processes = [
                Process(target=self.sample_run,
                        args=(30, max_steps, printlock, synchlock, arr,
                              episode, j)) for j in range(self.num_agents)
            ]
            for p in all_processes:
                p.start()

            for p in all_processes:
                p.join()

            for p in all_processes:
                p.terminate()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--ip', type=str, default='192.168.0.12')
    parser.add_argument('--port', type=str, default='20000')
    parser.add_argument('--rank', '-r', type=int)
    parser.add_argument('--world-size', '-s', type=int)
    args = parser.parse_args()
    print(args)
    # initialize(args.rank, args.world_size, args.ip, args.port)

    size = 2
    processes = []
    for i in range(size):
        p = Process(target=initialize, args=(i, size, args.ip, args.port))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()
def async_macro_model_train(model_data, gpus, save_dir, dataset='cifar10'):
    q = Queue(10)
    manager = multiprocessing.Manager()
    total_data_dict = manager.dict()
    p_producer = Process(target=model_producer, args=(model_data, q, gpus))
    time.sleep(3)
    p_consumers = [
        Process(target=model_consumer,
                args=(q, i, save_dir, total_data_dict, model_data, dataset))
        for i in range(gpus)
    ]
    p_producer.start()

    for p in p_consumers:
        p.start()

    p_producer.join()
    for p in p_consumers:
        p.join()

    data_dict = {}
    for k, v in total_data_dict.items():
        data_dict[v[2]] = (100 - v[0], 100 - v[1])
    return data_dict
def run_in_process_group(world_size, fn, input):
    assert not dist.is_initialized()
    processes = []
    q = Queue()
    port = get_free_tcp_port()
    log.info(f"using tcp port: {port}")
    backend = "gloo"
    for rank in range(world_size - 1):
        p = Process(
            target=init_process, args=(rank, world_size, fn, input, q, backend, port)
        )
        p.start()
        processes.append(p)

    if world_size >= 1:
        # run 1 process in current unittest process for debug purpose
        init_process(world_size - 1, world_size, fn, input, q, backend, port)

    for p in processes:
        p.join()
    return q
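
init_process is not shown in this snippet; a typical shape for it, assumed here rather than taken from the original source, would initialise the process group over TCP, run the test body, and report the result through the queue:

import torch.distributed as dist

def init_process(rank, world_size, fn, input, q, backend, port):
    dist.init_process_group(
        backend,
        init_method=f"tcp://127.0.0.1:{port}",
        rank=rank,
        world_size=world_size,
    )
    q.put((rank, fn(input)))      # run the collective test body, report the result
    dist.destroy_process_group()
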
def multi_process():
    # define shared variables to be shared across processes
    shared_num = mp.Value('i', 0)
    scores = mp.Queue()
    processes = []
    results = []
    for i in range(4):
        p = Process(target=f, args=(shared_num, scores))
        p.start()
        processes.append(p)
    while True:
        r = scores.get()
        if r is not None:
            results.append(r)
            print("!!! ", results)
        else:
            break
    for p in processes:
        p.join()

    print("EDN !!! ",results)
Example #31
def main(args):
    load_config_json(args)
    check_and_update_generation_args(args)
    adjust_multilingual_eval(args)
    set_seed(args)
    args.tasks = list(get_tasks(args.task_names, args).values())

    logger.info(f'Arguments:\n{pformat(vars(args))}')
    logger.info(f'Loading from {args.best_checkpoint}')

    devices = init_devices(args)
    if args.devices is not None:
        devices = [devices[i] for i in args.devices]

    if len(devices) > 1:
        # Independent multi-GPU generation
        all_processes = []
        all_data_folders = split_folder_on_disk(args.data, len(devices))

        for device_id in range(len(devices)):
            copy_args = copy.copy(args)
            copy_args.data = all_data_folders[device_id]
            copy_args.eval_dir = get_part_path(args.eval_dir, device_id)

            p = Process(target=run, args=(copy_args, devices[device_id]))
            all_processes.append(p)
            p.start()

        for p in all_processes:
            p.join()

        for folder in all_data_folders:
            shutil.rmtree(folder)
        combine_folders_on_disk(args.eval_dir,
                                len(devices),
                                line_group_size=1,
                                delete=True)

    else:
        run(args, devices[0])