Example no. 1
0
    def start_worker(self):
        """Spawn the training and evaluation worker processes.

        Creates a shared queue and a start barrier for each worker group,
        publishes the environment spec on ``self.env_info``, and launches
        one ``mp.Process`` per training/evaluation worker.
        """
        self.workers = []
        self.shared_que = self.manager.Queue(self.worker_nums)
        # Barrier has worker_nums + 1 parties: presumably the extra slot is
        # for the parent process, which also waits on this barrier so all
        # workers start in lockstep -- TODO confirm at the parent wait() site.
        self.start_barrier = mp.Barrier(self.worker_nums + 1)

        self.eval_workers = []
        self.eval_shared_que = self.manager.Queue(self.eval_worker_nums)
        # Same +1 party convention as the training barrier above.
        self.eval_start_barrier = mp.Barrier(self.eval_worker_nums + 1)

        # Workers construct their own environment instance from this spec.
        self.env_info.env_cls = self.env_cls
        self.env_info.env_args = self.env_args

        for i in range(self.worker_nums):
            # env_rank is pickled into the child when the process starts.
            self.env_info.env_rank = i
            p = mp.Process(
                target=self.__class__.train_worker_process,
                args=(self.__class__, self.shared_funcs, self.env_info,
                      self.replay_buffer, self.shared_que, self.start_barrier,
                      self.train_epochs)
            )  # collect training data for `train_epochs` each worker
            p.start()
            self.workers.append(p)

        for i in range(self.eval_worker_nums):
            # Eval workers only receive the policy function ("pf").
            eval_p = mp.Process(target=self.__class__.eval_worker_process,
                                args=(self.shared_funcs["pf"], self.env_info,
                                      self.eval_shared_que,
                                      self.eval_start_barrier,
                                      self.eval_epochs))
            eval_p.start()
            self.eval_workers.append(eval_p)
Example no. 2
0
    def start_worker(self):
        """Launch the training and evaluation worker processes.

        Builds one queue/barrier pair per worker group, stores the
        environment spec on ``self.env_info``, and starts one
        ``mp.Process`` per worker.
        """
        # Training-side workers and synchronization primitives.
        self.workers = []
        self.shared_que = self.manager.Queue(self.worker_nums)
        self.start_barrier = mp.Barrier(self.worker_nums)

        # Evaluation-side workers and synchronization primitives.
        self.eval_workers = []
        self.eval_shared_que = self.manager.Queue(self.eval_worker_nums)
        self.eval_start_barrier = mp.Barrier(self.eval_worker_nums)

        # Each worker builds its own environment from this class/args spec.
        self.env_info.env_cls = self.env_cls
        self.env_info.env_args = self.env_args

        for rank in range(self.worker_nums):
            # env_rank is snapshotted into the child when start() pickles args.
            self.env_info.env_rank = rank
            trainer_args = (self.__class__, self.shared_funcs, self.env_info,
                            self.replay_buffer, self.shared_que,
                            self.start_barrier, self.train_epochs)
            trainer = mp.Process(target=self.__class__.train_worker_process,
                                 args=trainer_args)
            trainer.start()
            self.workers.append(trainer)

        for _ in range(self.eval_worker_nums):
            # Eval workers only need the policy function ("pf").
            evaluator_args = (self.shared_funcs["pf"], self.env_info,
                              self.eval_shared_que, self.eval_start_barrier,
                              self.eval_epochs, self.reset_idx)
            evaluator = mp.Process(target=self.__class__.eval_worker_process,
                                   args=evaluator_args)
            evaluator.start()
            self.eval_workers.append(evaluator)
Example no. 3
0
    def start_worker(self):
        """Spawn training and evaluation worker processes.

        Unlike queue-bounded variants, the queues here are unbounded and a
        shared integer ``terminate_mark`` is handed to every worker --
        presumably set nonzero later to signal shutdown (confirm in the
        worker loops).
        """
        self.workers = []
        self.shared_que = self.manager.Queue()
        # +1 party: presumably the parent process also waits on the barrier
        # to synchronize worker startup -- TODO confirm at the wait() site.
        self.start_barrier = mp.Barrier(self.worker_nums + 1)
        # Shared int flag ('i'), initialized to 0, visible to all workers.
        self.terminate_mark = mp.Value('i', 0)

        self.eval_workers = []
        self.eval_shared_que = self.manager.Queue()
        # Same +1 party convention as the training barrier.
        self.eval_start_barrier = mp.Barrier(self.eval_worker_nums + 1)

        for i in range(self.worker_nums):
            # env_rank is pickled into the child at start().
            self.env_info.env_rank = i
            p = mp.Process(target=self.__class__.train_worker_process,
                           args=(self.__class__, self.funcs, self.env_info,
                                 self.replay_buffer, self.shared_que,
                                 self.start_barrier, self.terminate_mark))
            p.start()
            self.workers.append(p)

        for i in range(self.eval_worker_nums):
            # Eval workers receive the policy `self.pf` directly.
            eval_p = mp.Process(target=self.__class__.eval_worker_process,
                                args=(self.pf, self.env_info,
                                      self.eval_shared_que,
                                      self.eval_start_barrier,
                                      self.terminate_mark, self.reset_idx))
            eval_p.start()
            self.eval_workers.append(eval_p)
Example no. 4
0
def mp_trainer(np, model, grad_buffer, optimizer, it_num=0):
    """Run `np` trainer processes plus one optimizer process and join them.

    Args:
        np: number of trainer processes. (The parameter name shadows the
            common numpy alias; kept for backward compatibility.)
        model: model shared with every child process.
        grad_buffer: shared gradient buffer.
        optimizer: optimizer handed to the optimizer process and trainers.
        it_num: starting iteration number passed to every process.

    Exits the interpreter with status -1 when `np` is None.
    """
    if np is None:
        print("can not get num of process!")
        sys.exit(-1)

    # np trainers and an optimizer. The barrier has np + 1 parties: the
    # np trainer processes plus the single optimizer process.
    Barrier = mp.Barrier(np + 1)
    Condition = mp.Condition()

    p_opt_args = (np, it_num, Barrier, optimizer, Condition, model,
                  grad_buffer)
    p_opt = mp.Process(target=optimizer_process, args=p_opt_args)
    p_opt.start()

    processes = []
    processes.append(p_opt)

    # Shared scalar so trainers can report scores to the parent.
    shared_score = torch.FloatTensor([0])
    shared_score.share_memory_()

    # `rank` instead of `id`: `id` shadows the builtin of the same name.
    for rank in range(np):
        p_trainer_args = (rank, it_num, Barrier, optimizer, Condition, model,
                          grad_buffer, shared_score, np)
        p_trainer = mp.Process(target=trainer_process, args=p_trainer_args)
        p_trainer.start()
        processes.append(p_trainer)

    # Block until the optimizer and every trainer have finished.
    for p in processes:
        p.join()
def spawn_cpu_servers(p, bounds_net):
    """Fork child processes that parallelize last-layer bounds computation
    on CPU, returning the spawn handle plus the communication primitives.

    Returns:
        (cpu_servers, servers_queue, instruction_queue, barrier)
    """
    # Result and command channels shared between parent and servers.
    servers_queue = mp.Queue()
    instruction_queue = mp.Queue()
    # p parties while only p - 1 processes are spawned: presumably the
    # parent is the remaining waiter -- confirm against the callers.
    barrier = mp.Barrier(p)
    server_args = (copy.deepcopy(bounds_net), servers_queue,
                   instruction_queue, barrier)
    # join=False: return immediately with the spawn context.
    cpu_servers = mp.spawn(last_bounds_cpu_server, args=server_args,
                           nprocs=(p - 1), join=False)
    return cpu_servers, servers_queue, instruction_queue, barrier
Example no. 6
0
    def start_worker(self):
        """Spawn one evaluation worker per multi-task environment task.

        This variant runs no training workers (asserted below); it only
        creates eval workers, one per task in ``self.env_cls``.
        """
        self.eval_workers = []
        self.eval_shared_que = self.manager.Queue(self.eval_worker_nums)
        self.eval_start_barrier = mp.Barrier(self.eval_worker_nums)

        # task_cls, task_args, env_params
        tasks = list(self.env_cls.keys())

        assert self.worker_nums == 0
        assert self.eval_worker_nums == self.env.num_tasks

        # Children rebuild a single-task env via generate_single_mt_env.
        self.env_info.env = None
        self.env_info.num_tasks = self.env.num_tasks
        self.env_info.env_cls = generate_single_mt_env
        single_mt_env_args = {
            "task_cls": None,
            "task_args": None,
            "env_rank": 0,
            "num_tasks": self.env.num_tasks,
            "max_obs_dim": np.prod(self.env.observation_space.shape),
            "env_params": self.env_args[0],
            "meta_env_params": self.env_args[2]
        }

        for i, task in enumerate(tasks):
            env_cls = self.env_cls[task]

            self.env_info.env_rank = i

            # NOTE(review): every iteration reuses and mutates the SAME
            # single_mt_env_args dict object. This works only because
            # mp.Process pickles args at start(), snapshotting the dict
            # per child -- confirm no lazy reads happen after start().
            self.env_info.env_args = single_mt_env_args
            self.env_info.env_args["task_cls"] = env_cls
            self.env_info.env_args["task_args"] = copy.deepcopy(
                self.env_args[1][task])

            # "start_epoch" is stripped from task_args before the env is
            # built; the worker is always told start_epoch = 0.
            start_epoch = 0
            if "start_epoch" in self.env_info.env_args["task_args"]:
                # start_epoch = self.env_info.env_args["task_args"]["start_epoch"]
                del self.env_info.env_args["task_args"]["start_epoch"]
            # else:
            # start_epoch = 0

            self.env_info.env_args["env_rank"] = i
            eval_p = mp.Process(target=self.__class__.eval_worker_process,
                                args=(self.shared_funcs["pf"], self.env_info,
                                      self.eval_shared_que,
                                      self.eval_start_barrier,
                                      self.eval_epochs, start_epoch, task))
            eval_p.start()
            self.eval_workers.append(eval_p)
 def __init__(self, cfg):
     """Set up one exploration process per environment thread.

     Creates pipes, a shared action tensor, per-thread replay buffers,
     and a barrier shared by all explorer processes plus this parent.
     """
     super(ParallelExplorer, self).__init__()
     #
     # This must be set in the main.
     # mp.set_start_method('forkserver')
     self.processes = []
     self.comms = []
     self.followup = []
     self.replayBuffers = []
     self.curThread = 0
     self.nThreads = cfg.numEnv
     # NOTE(review): -float('nan') is still NaN -- the unary minus has no
     # effect; the intent may have been -inf as a "no reward yet" marker.
     self.meanRewards = [-float('nan')] * self.nThreads
     self.numEps = [0] * self.nThreads
     self.nInBuffers = 0
     self.totSteps = 0
     self.maxBuffers = cfg.numFramesPerBuffer
     self.exploreSched = cfg.exploreSched
     self.model = cfg.model
     # One action slot per thread, placed in shared memory so explorer
     # processes can read actions written by the parent.
     self.actionVec = torch.LongTensor(self.nThreads).zero_()
     self.actionVec.storage().share_memory_()
     self.threads = np.atleast_1d(np.arange(self.nThreads, dtype=np.int64))
     self.toTensorImg, self.toTensor, self.use_cuda = TensorConfig.getTensorConfiguration(
     )
     self.cfg = cfg
     # nThreads explorers + this parent process wait on the barrier.
     self.barrier = mp.Barrier(self.nThreads + 1)
     #
     # How to sample.
     self.sampleFn = self._sampleRandom
     if cfg.sampleLatest:
         self.sampleFn = self._sampleLatest
     #
     # Sample from all threads.
     for idx in range(self.nThreads):
         print('Exploration: Actually set the seed properly.')
         sendP, subpipe = mp.Pipe()
         explorer = ExploreProcess(subpipe, cfg, idx, idx, self.actionVec,
                                   self.barrier)
         # Daemonized so explorers die with the parent.
         explorer.daemon = True
         explorer.start()
         self.processes.append(explorer)
         self.comms.append(sendP)
         self.replayBuffers.append(
             ReplayBuffer(cfg.numFramesPerBuffer, cfg.stackFrameLen))
         self.followup.append(idx)
     # Action/observation spaces read from the first explorer's env --
     # presumably ExploreProcess builds `env` in its constructor, before
     # start(); confirm, since after start() this reads the parent's copy.
     self.nAct = self.processes[0].env.action_space.n
     self.imshape = self.processes[0].env.observation_space.shape
     print('Parent PID: %d' % os.getpid())
Example no. 8
0
    def __init__(self, config):
        """Build the training manager: models, queues, pipes, and barriers.

        Either loads a model for transfer learning or creates a fresh one
        (optionally initializing actor/critic from a saved model), then
        allocates per-thread model copies and all IPC primitives.
        """
        super(TrainManager, self).__init__()

        device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')

        self.config = config
        self.training_config = self.config['training']

        if self.training_config['transfer']:
            # Transfer learning: reuse a previously saved model as-is.
            self.target_model = load_model(self.training_config['model_path'])
        else:
            self.target_model = create_model(self.config['model'])
            # alt_model is loaded lazily and reused if both actor and
            # critic need initialization from the saved checkpoint.
            alt_model = None
            if self.training_config.get('init_actor', False):
                alt_model = load_model(self.training_config['model_path'])
                self.target_model.init_actor(alt_model)
            if self.training_config.get('init_critic', False):
                if alt_model is None:
                    alt_model = load_model(self.training_config['model_path'])
                self.target_model.init_critic(alt_model)
            self._prime_model(self.target_model, device)

        # One (model, proxy_model) deep-copied pair per training thread.
        self.models = []
        self.proxy_models = []

        for _ in range(self.training_config['num_threads_training']):
            model = copy.deepcopy(self.target_model)
            proxy_model = copy.deepcopy(self.target_model)
            self._prime_model(model, device)
            self._prime_model(proxy_model, device)
            self.models.append(model)
            self.proxy_models.append(proxy_model)

        self.processes = []

        # One bounded episode queue per sampling thread.
        self.episode_queues = [
            torch_mp.Queue(maxsize=128)
            for _ in range(self.training_config['num_threads_sampling'])
        ]

        # One bounded sample queue per training thread.
        self.sample_queues = [
            torch_mp.Queue(
                maxsize=self.training_config['sampling_queue_max_len'])
            for _ in range(self.training_config['num_threads_training'])
        ]

        # One-way pipes (duplex=False) per virtual exploring thread.
        self.action_conns = [
            torch_mp.Pipe(duplex=False) for _ in range(
                self.training_config['num_threads_exploring_virtual'])
        ]

        self.observation_conns = [
            torch_mp.Pipe(duplex=False) for _ in range(
                self.training_config['num_threads_exploring_virtual'])
        ]

        self.observation_queue = torch_mp.Queue()
        self.action_queue = torch_mp.Queue()

        # Barriers span only the training threads.
        self.start_barrier = torch_mp.Barrier(
            self.training_config['num_threads_training'])
        self.finish_barrier = torch_mp.Barrier(
            self.training_config['num_threads_training'])
        self.update_lock = torch_mp.Lock()

        # Shared scalars for cross-process progress tracking.
        self.best_reward = Value('f', 0.0)
        self.global_episode = Value('i', 0)
        self.global_update_step = Value('i', 0)
Example no. 9
0
            'Number of tasks provided does not match the number of batch sizes provided.'
        )

    n_gpus = int(args.n_gpus)
    n_tasks = len(tasks) * n_jobs

    shared_model = omninet.OmniNet(gpu_id=0)
    if restore != -1:
        shared_model.restore(model_save_path, restore)
    else:
        restore = 0

    shared_model = shared_model.to(0)
    shared_model.share_memory()
    counters = [Counter(restore) for i in range(len(tasks))]
    barrier = mp.Barrier(n_tasks)
    start = int(restore / n_jobs)
    # Declare training processes for multi-gpu hogwild training
    processes = []
    for i in range(n_tasks):
        #If more than one GPU is used, use first GPU only for model sharing
        if n_gpus > 1:
            gpu_id = i % n_gpus
        else:
            gpu_id = 0
        process = mp.Process(target=train,
                             args=(shared_model, tasks[i % len(tasks)],
                                   batch_sizes[i % len(tasks)],
                                   int(n_iters / n_jobs), gpu_id, start,
                                   restore, counters[i % len(tasks)], barrier,
                                   (save_interval if i == 0 else None),
Example no. 10
0
def start_worker(args, logger):
    """Start kvclient for training.

    Loads the local data partition, builds shared partition books and the
    model in shared memory, creates per-client training samplers, then
    launches ``args.num_client`` training processes and joins them.
    """
    init_time_start = time.time()
    time.sleep(WAIT_TIME)  # wait for launch script

    server_namebook = dgl.contrib.read_ip_config(filename=args.ip_config)

    args.machine_id = get_local_machine_id(server_namebook)

    dataset, entity_partition_book, local2global = get_partition_dataset(
        args.data_path, args.dataset, args.format, args.machine_id)

    n_entities = dataset.n_entities
    n_relations = dataset.n_relations

    print('Partition %d n_entities: %d' % (args.machine_id, n_entities))
    print("Partition %d n_relations: %d" % (args.machine_id, n_relations))

    entity_partition_book = F.tensor(entity_partition_book)
    relation_partition_book = get_long_tail_partition(dataset.n_relations,
                                                      args.total_machine)
    relation_partition_book = F.tensor(relation_partition_book)
    local2global = F.tensor(local2global)

    # Shared memory so every child training process sees the same books.
    relation_partition_book.share_memory_()
    entity_partition_book.share_memory_()
    local2global.share_memory_()

    train_data = TrainDataset(dataset, args, ranks=args.num_client)
    # if there is no cross partition relaiton, we fall back to strict_rel_part
    args.strict_rel_part = args.mix_cpu_gpu and (train_data.cross_part
                                                 == False)
    args.soft_rel_part = args.mix_cpu_gpu and args.soft_rel_part and train_data.cross_part

    # A negative eval sample size means "use all entities".
    if args.neg_sample_size_eval < 0:
        args.neg_sample_size_eval = dataset.n_entities
    args.batch_size = get_compatible_batch_size(args.batch_size,
                                                args.neg_sample_size)
    args.batch_size_eval = get_compatible_batch_size(args.batch_size_eval,
                                                     args.neg_sample_size_eval)

    args.num_workers = 8  # fix num_workers to 8
    # One head+tail bidirectional sampler per client process.
    train_samplers = []
    for i in range(args.num_client):
        train_sampler_head = train_data.create_sampler(
            args.batch_size,
            args.neg_sample_size,
            args.neg_sample_size,
            mode='head',
            num_workers=args.num_workers,
            shuffle=True,
            exclude_positive=False,
            rank=i)
        train_sampler_tail = train_data.create_sampler(
            args.batch_size,
            args.neg_sample_size,
            args.neg_sample_size,
            mode='tail',
            num_workers=args.num_workers,
            shuffle=True,
            exclude_positive=False,
            rank=i)
        train_samplers.append(
            NewBidirectionalOneShotIterator(train_sampler_head,
                                            train_sampler_tail,
                                            args.neg_sample_size,
                                            args.neg_sample_size, True,
                                            n_entities))

    # Drop the reference so the dataset can be garbage-collected before fork.
    dataset = None

    model = load_model(logger, args, n_entities, n_relations)
    model.share_memory()

    print('Total initialize time {:.3f} seconds'.format(time.time() -
                                                        init_time_start))

    rel_parts = train_data.rel_parts if args.strict_rel_part or args.soft_rel_part else None
    cross_rels = train_data.cross_rels if args.soft_rel_part else None

    # Barrier spans exactly the num_client training processes; the parent
    # does not wait on it.
    procs = []
    barrier = mp.Barrier(args.num_client)
    for i in range(args.num_client):
        proc = mp.Process(target=dist_train_test,
                          args=(args, model, train_samplers[i],
                                entity_partition_book, relation_partition_book,
                                local2global, i, rel_parts, cross_rels,
                                barrier))
        procs.append(proc)
        proc.start()
    for proc in procs:
        proc.join()
Example no. 11
0
def profile(run_config: RunConfig):
    """Profile co-located model combinations and write timings to CSV.

    For each model combination, spawns one ProfilerWorker per model,
    drives them over randomized partition/batch-size/seq-len configs via
    pipes, measures the parent-side elapsed time per loop, and records
    per-config statistics into one CSV file per combination.
    """
    # total_models worker processes + this parent wait on the barrier.
    barrier = mp.Barrier(run_config.total_models + 1)
    profile_data_path = os.path.join(run_config.data_path, "profile")
    os.makedirs(profile_data_path, exist_ok=True)
    for model_combination in gen_model_combinations(
        run_config.models_name,
        run_config.profiling_combinations,
    ):
        print(model_combination)
        # CSV name is the underscore-joined model names of the combination.
        profile_filename = model_combination[0]
        for model_name in model_combination[1:]:
            profile_filename = profile_filename + "_" + model_name
        profile_filename += ".csv"
        # NOTE(review): profile_file is flushed but never closed; consider
        # a `with` block (it stays open for the whole combination).
        profile_file = open(os.path.join(profile_data_path, profile_filename), "w+")
        wr = csv.writer(profile_file, dialect="excel")
        # One (model, start, end, bs, seq_len) group per model, then stats.
        profile_head = [
            "model",
            "start",
            "end",
            "bs",
            "seq_len",
        ] * run_config.total_models + [
            "median",
            "mean",
            "var",
        ]
        wr.writerow(profile_head)

        # One worker process + parent-side pipe end per model.
        worker_list = []
        for worker_id, model_name in enumerate(model_combination):
            pipe_parent, pipe_child = mp.Pipe()
            model_worker = ProfilerWorker(
                run_config,
                model_name,
                run_config.supported_batchsize,
                run_config.supported_seqlen,
                pipe_child,
                barrier,
                worker_id,
            )
            model_worker.start()
            worker_list.append((model_worker, pipe_parent))
        # Wait until every worker has finished its setup.
        barrier.wait()

        for bs_it in itertools.product(
            run_config.supported_batchsize, repeat=run_config.total_models
        ):
            model_ids = [i for i in range(run_config.total_models)]
            # Dedup set: randomly drawn configs may repeat across tests.
            profiled_config = set()
            for test_i in range(run_config.total_test):
                model_config = []
                qos_query_cnt = random.randrange(1, run_config.total_models + 1)
                new_query_cnt = random.randrange(1, run_config.total_models + 1)
                qos_ids = random.sample(model_ids, qos_query_cnt)
                new_ids = random.sample(model_ids, new_query_cnt)
                for i in range(run_config.total_models):
                    start, end = gen_partition(
                        run_config.models_len[model_combination[i]],
                        True if i in qos_ids else False,
                        True if i in new_ids else False,
                    )
                    # seq_len only matters for bert; 0 otherwise.
                    seq_len = (
                        random.choice(run_config.supported_seqlen)
                        if model_combination[i] == "bert"
                        else 0
                    )
                    model_config.append(
                        [model_combination[i], start, end, bs_it[i], seq_len]
                    )
                pendding_profile_config = tuple(tuple(i) for i in model_config)
                if pendding_profile_config in profiled_config:
                    print(
                        "Profiled model config: {}, {}, {}, {}, {},{}, {}, {}, {}, {}".format(
                            model_config[0][0],
                            model_config[0][1],
                            model_config[0][2],
                            model_config[0][3],
                            model_config[0][4],
                            model_config[1][0],
                            model_config[1][1],
                            model_config[1][2],
                            model_config[1][3],
                            model_config[1][4],
                        )
                    )
                else:
                    profiled_config.add(pendding_profile_config)
                    # Ask every worker to load/prepare its sub-model.
                    for i in range(run_config.total_models):
                        _, model_pipe = worker_list[i]
                        model_pipe.send(
                            (
                                model_config[i][0],
                                "prepare",
                                model_config[i][1],
                                model_config[i][2],
                                model_config[i][3],
                                model_config[i][4],
                            )
                        )
                    barrier.wait()
                    record = []
                    with tqdm(range(run_config.test_loop)) as t:
                        for loop_i in t:
                            start_time = datetime.datetime.now()
                            for i in range(run_config.total_models):
                                _, model_pipe = worker_list[i]
                                model_pipe.send(
                                    (
                                        model_config[i][0],
                                        "forward",
                                        model_config[i][1],
                                        model_config[i][2],
                                        model_config[i][3],
                                        model_config[i][4],
                                    )
                                )
                            # barrier.wait()
                            # start_time = datetime.datetime.now()
                            barrier.wait()
                            # NOTE(review): .microseconds is only the
                            # sub-second component (wraps at 1s); for a
                            # true duration use .total_seconds() * 1e6.
                            elapsed_time_us = (
                                datetime.datetime.now() - start_time
                            ).microseconds
                            t.set_postfix(elapsed=elapsed_time_us)
                            t.update(1)
                            record.append(elapsed_time_us)

                    profile_record = make_record(model_config, record)
                    wr.writerow(profile_record)
                    profile_file.flush()
        # Tell every worker to shut down, then reap the processes.
        for i in range(run_config.total_models):
            _, model_pipe = worker_list[i]
            model_pipe.send(("none", "terminate", -1, -1, -1, -1))

        for worker, _ in worker_list:
            worker.join()
Example no. 12
0
def main():
    args = ArgParser().parse_args()
    prepare_save_path(args)
    assert args.dataset == 'wikikg90m'
    args.neg_sample_size_eval = 1000
    set_global_seed(args.seed)

    init_time_start = time.time()
    # load dataset and samplers
    dataset = get_dataset(args.data_path, args.dataset, args.format,
                          args.delimiter, args.data_files,
                          args.has_edge_importance)

    if args.neg_sample_size_eval < 0:
        args.neg_sample_size_eval = dataset.n_entities
    args.batch_size = get_compatible_batch_size(args.batch_size,
                                                args.neg_sample_size)
    args.batch_size_eval = get_compatible_batch_size(args.batch_size_eval,
                                                     args.neg_sample_size_eval)
    # We should turn on mix CPU-GPU training for multi-GPU training.
    if len(args.gpu) > 1:
        args.mix_cpu_gpu = True
        if args.num_proc < len(args.gpu):
            args.num_proc = len(args.gpu)
    # We need to ensure that the number of processes should match the number of GPUs.
    if len(args.gpu) > 1 and args.num_proc > 1:
        assert args.num_proc % len(args.gpu) == 0, \
                'The number of processes needs to be divisible by the number of GPUs'
    # For multiprocessing training, we need to ensure that training processes are synchronized periodically.
    if args.num_proc > 1:
        args.force_sync_interval = 1000

    args.eval_filter = not args.no_eval_filter
    if args.neg_deg_sample_eval:
        assert not args.eval_filter, "if negative sampling based on degree, we can't filter positive edges."

    args.soft_rel_part = args.mix_cpu_gpu and args.rel_part
    print("To build training dataset")
    t1 = time.time()
    train_data = TrainDataset(dataset,
                              args,
                              ranks=args.num_proc,
                              has_importance=args.has_edge_importance)
    print("Training dataset built, it takes %d seconds" % (time.time() - t1))
    # if there is no cross partition relaiton, we fall back to strict_rel_part
    args.strict_rel_part = args.mix_cpu_gpu and (train_data.cross_part
                                                 == False)
    args.num_workers = 8  # fix num_worker to 8
    set_logger(args)
    with open(os.path.join(args.save_path, args.encoder_model_name), 'w') as f:
        f.write(args.encoder_model_name)
    if args.num_proc > 1:
        train_samplers = []
        for i in range(args.num_proc):
            print("Building training sampler for proc %d" % i)
            t1 = time.time()
            # for each GPU, allocate num_proc // num_GPU processes
            train_sampler_head = train_data.create_sampler(
                args.batch_size,
                args.neg_sample_size,
                args.neg_sample_size,
                mode='head',
                num_workers=args.num_workers,
                shuffle=True,
                exclude_positive=False,
                rank=i)
            train_sampler_tail = train_data.create_sampler(
                args.batch_size,
                args.neg_sample_size,
                args.neg_sample_size,
                mode='tail',
                num_workers=args.num_workers,
                shuffle=True,
                exclude_positive=False,
                rank=i)
            train_samplers.append(
                NewBidirectionalOneShotIterator(train_sampler_head,
                                                train_sampler_tail,
                                                args.neg_sample_size,
                                                args.neg_sample_size, True,
                                                dataset.n_entities,
                                                args.has_edge_importance))
            print("Training sampler for proc %d created, it takes %s seconds" %
                  (i, time.time() - t1))

        train_sampler = NewBidirectionalOneShotIterator(
            train_sampler_head, train_sampler_tail, args.neg_sample_size,
            args.neg_sample_size, True, dataset.n_entities,
            args.has_edge_importance)
    else:  # This is used for debug
        train_sampler_head = train_data.create_sampler(
            args.batch_size,
            args.neg_sample_size,
            args.neg_sample_size,
            mode='head',
            num_workers=args.num_workers,
            shuffle=True,
            exclude_positive=False)
        train_sampler_tail = train_data.create_sampler(
            args.batch_size,
            args.neg_sample_size,
            args.neg_sample_size,
            mode='tail',
            num_workers=args.num_workers,
            shuffle=True,
            exclude_positive=False)
        train_sampler = NewBidirectionalOneShotIterator(
            train_sampler_head, train_sampler_tail, args.neg_sample_size,
            args.neg_sample_size, True, dataset.n_entities,
            args.has_edge_importance)

    if args.valid or args.test:
        if len(args.gpu) > 1:
            args.num_test_proc = args.num_proc if args.num_proc < len(
                args.gpu) else len(args.gpu)
        else:
            args.num_test_proc = args.num_proc
        print("To create eval_dataset")
        t1 = time.time()
        eval_dataset = EvalDataset(dataset, args)
        print("eval_dataset created, it takes %d seconds" % (time.time() - t1))

    if args.valid:
        if args.num_proc > 1:
            # valid_sampler_heads = []
            valid_sampler_tails = []
            for i in range(args.num_proc):
                print("creating valid sampler for proc %d" % i)
                t1 = time.time()
                # valid_sampler_head = eval_dataset.create_sampler('valid', args.batch_size_eval,
                #                                                   args.neg_sample_size_eval,
                #                                                   args.neg_sample_size_eval,
                #                                                   args.eval_filter,
                #                                                   mode='head',
                #                                                   num_workers=args.num_workers,
                #                                                   rank=i, ranks=args.num_proc)
                valid_sampler_tail = eval_dataset.create_sampler(
                    'valid',
                    args.batch_size_eval,
                    args.neg_sample_size_eval,
                    args.neg_sample_size_eval,
                    args.eval_filter,
                    mode='tail',
                    num_workers=args.num_workers,
                    rank=i,
                    ranks=args.num_proc)
                # valid_sampler_heads.append(valid_sampler_head)
                valid_sampler_tails.append(valid_sampler_tail)
                print(
                    "Valid sampler for proc %d created, it takes %s seconds" %
                    (i, time.time() - t1))
        else:  # This is used for debug
            # valid_sampler_head = eval_dataset.create_sampler('valid', args.batch_size_eval,
            #                                                  args.neg_sample_size_eval,
            #                                                  1,
            #                                                  args.eval_filter,
            #                                                  mode='head',
            #                                                  num_workers=args.num_workers,
            #                                                  rank=0, ranks=1)
            valid_sampler_tail = eval_dataset.create_sampler(
                'valid',
                args.batch_size_eval,
                args.neg_sample_size_eval,
                1,
                args.eval_filter,
                mode='tail',
                num_workers=args.num_workers,
                rank=0,
                ranks=1)
    if args.test:
        if args.num_test_proc > 1:
            test_sampler_tails = []
            # test_sampler_heads = []
            for i in range(args.num_test_proc):
                print("creating test sampler for proc %d" % i)
                t1 = time.time()
                # test_sampler_head = eval_dataset.create_sampler('test', args.batch_size_eval,
                #                                                  args.neg_sample_size_eval,
                #                                                  args.neg_sample_size_eval,
                #                                                  args.eval_filter,
                #                                                  mode='head',
                #                                                  num_workers=args.num_workers,
                #                                                  rank=i, ranks=args.num_test_proc)
                test_sampler_tail = eval_dataset.create_sampler(
                    'test',
                    args.batch_size_eval,
                    args.neg_sample_size_eval,
                    args.neg_sample_size_eval,
                    args.eval_filter,
                    mode='tail',
                    num_workers=args.num_workers,
                    rank=i,
                    ranks=args.num_test_proc)
                # test_sampler_heads.append(test_sampler_head)
                test_sampler_tails.append(test_sampler_tail)
                print("Test sampler for proc %d created, it takes %s seconds" %
                      (i, time.time() - t1))
        else:
            # test_sampler_head = eval_dataset.create_sampler('test', args.batch_size_eval,
            #                                                 args.neg_sample_size_eval,
            #                                                 1,
            #                                                 args.eval_filter,
            #                                                 mode='head',
            #                                                 num_workers=args.num_workers,
            #                                                 rank=0, ranks=1)
            test_sampler_tail = eval_dataset.create_sampler(
                'test',
                args.batch_size_eval,
                args.neg_sample_size_eval,
                1,
                args.eval_filter,
                mode='tail',
                num_workers=args.num_workers,
                rank=0,
                ranks=1)
    # pdb.set_trace()
    # load model
    print("To create model")
    t1 = time.time()
    model = load_model(args, dataset.n_entities, dataset.n_relations,
                       dataset.entity_feat.shape[1],
                       dataset.relation_feat.shape[1])
    if args.encoder_model_name in ['roberta', 'concat']:
        model.entity_feat.emb = dataset.entity_feat
        model.relation_feat.emb = dataset.relation_feat
    print("Model created, it takes %s seconds" % (time.time() - t1))
    model.evaluator = WikiKG90MEvaluator()

    if args.num_proc > 1 or args.async_update:
        model.share_memory()

    emap_file = dataset.emap_fname
    rmap_file = dataset.rmap_fname
    # We need to free all memory referenced by dataset.
    eval_dataset = None
    dataset = None

    print('Total initialize time {:.3f} seconds'.format(time.time() -
                                                        init_time_start))

    # train
    start = time.time()
    rel_parts = train_data.rel_parts if args.strict_rel_part or args.soft_rel_part else None
    cross_rels = train_data.cross_rels if args.soft_rel_part else None

    if args.num_proc > 1:
        procs = []
        barrier = mp.Barrier(args.num_proc)
        for i in range(args.num_proc):
            # valid_sampler = [valid_sampler_heads[i], valid_sampler_tails[i]] if args.valid else None
            # test_sampler = [test_sampler_heads[i], test_sampler_tails[i]] if args.test else None
            valid_sampler = [valid_sampler_tails[i]] if args.valid else None
            test_sampler = [test_sampler_tails[i]] if args.test else None
            proc = mp.Process(target=train_mp,
                              args=(
                                  args,
                                  model,
                                  train_samplers[i],
                                  valid_sampler,
                                  test_sampler,
                                  i,
                                  rel_parts,
                                  cross_rels,
                                  barrier,
                              ))
            procs.append(proc)
            proc.start()
        for proc in procs:
            proc.join()
    else:
        valid_samplers = [valid_sampler_tail] if args.valid else None
        test_samplers = [test_sampler_tail] if args.test else None
        # valid_samplers = [valid_sampler_head, valid_sampler_tail] if args.valid else None
        # test_samplers = [test_sampler_head, test_sampler_tail] if args.test else None
        train(args,
              model,
              train_sampler,
              valid_samplers,
              test_samplers,
              rel_parts=rel_parts)

    print('training takes {} seconds'.format(time.time() - start))
Esempio n. 13
0
def start_worker(args, logger):
    """Launch the kvclient training processes for one machine.

    Reads the kvstore IP configuration to find this machine's partition,
    loads the partitioned dataset, moves the partition books and the model
    into shared memory, builds one bidirectional training sampler per
    client, then forks a `dist_train_test` worker per client and waits for
    all of them to finish.

    Args:
        args: Parsed command-line namespace. Mutated in place: sets
            `machine_id`, may set `neg_chunk_size`, sets `num_thread`.
        logger: Logger passed through to `load_model`.
    """
    t_start = time.time()

    # Determine which partition this machine owns from the kvstore config.
    server_namebook = dgl.contrib.read_ip_config(filename=args.ip_config)
    args.machine_id = get_local_machine_id(server_namebook)

    dataset, entity_partition_book, local2global = get_partition_dataset(
        args.data_path, args.dataset, args.format, args.machine_id)

    n_entities, n_relations = dataset.n_entities, dataset.n_relations
    print('Partition %d n_entities: %d' % (args.machine_id, n_entities))
    print("Partition %d n_relations: %d" % (args.machine_id, n_relations))

    # Convert the partition books to tensors and place them in shared
    # memory so every forked worker sees the same copies.
    entity_partition_book = F.tensor(entity_partition_book)
    relation_partition_book = F.tensor(
        get_long_tail_partition(dataset.n_relations, args.total_machine))
    local2global = F.tensor(local2global)
    for shared_tensor in (relation_partition_book,
                          entity_partition_book,
                          local2global):
        shared_tensor.share_memory_()

    model = load_model(logger, args, n_entities, n_relations)
    model.share_memory()

    # When we generate a batch of negative edges from a set of positive edges,
    # we first divide the positive edges into chunks and corrupt the edges in a chunk
    # together. By default, the chunk size is equal to the negative sample size.
    # Usually, this works well. But we also allow users to specify the chunk size themselves.
    if args.neg_chunk_size < 0:
        args.neg_chunk_size = args.neg_sample_size

    num_workers = NUM_WORKER
    train_data = TrainDataset(dataset, args, ranks=args.num_client)

    def make_sampler(mode, rank):
        # Single-direction (head or tail corruption) sampler for one client.
        return train_data.create_sampler(args.batch_size,
                                         args.neg_sample_size,
                                         args.neg_chunk_size,
                                         mode=mode,
                                         num_workers=num_workers,
                                         shuffle=True,
                                         exclude_positive=False,
                                         rank=rank)

    # One head+tail iterator per client process.
    train_samplers = [
        NewBidirectionalOneShotIterator(make_sampler('head', rank),
                                        make_sampler('tail', rank),
                                        args.neg_chunk_size,
                                        args.neg_sample_size,
                                        True, n_entities)
        for rank in range(args.num_client)
    ]

    # Drop the raw dataset reference so its memory can be reclaimed.
    dataset = None

    print('Total data loading time {:.3f} seconds'.format(time.time() - t_start))

    rel_parts = train_data.rel_parts if args.strict_rel_part or args.soft_rel_part else None
    cross_rels = train_data.cross_rels if args.soft_rel_part else None

    args.num_thread = NUM_THREAD

    # Fork one worker per client; the barrier lets them synchronize.
    barrier = mp.Barrier(args.num_client)
    processes = []
    for rank in range(args.num_client):
        worker = mp.Process(target=dist_train_test,
                            args=(args,
                                  model,
                                  train_samplers[rank],
                                  entity_partition_book,
                                  relation_partition_book,
                                  local2global,
                                  rank,
                                  rel_parts,
                                  cross_rels,
                                  barrier))
        processes.append(worker)
        worker.start()
    for worker in processes:
        worker.join()
Esempio n. 14
0
    def train(
            self,
            args: Namespace,
            env_builder: Callable[[], Env],
            algo: RLAlgo
        ) -> None:
        """
        Trains the algorithm on the environment given using the argument
        namespace as parameters.

        "args" must have the following attributes:
        {
            experiment_path (str): The path to save experiment results and
                models.
            render (bool): Render the environment.
            steps_per_episode (Optional[int]): The number of steps in each
                episode.
            silent (bool): Will run without standard output from agents.
            action_mask (Optional[Tuple[bool, ...]]): The action mask to mask or
                unmask.
            masked (Optional[bool]): If an action mask is given, should be True
                if the returned agent actions are already masked.
            default_action (Optional[Tuple[float, ...]]): If an action mask is
                given and going from masked -> unmasked, this should be the
                default values for the actions.
            discount (float): The discount factor (gamma) for the target
                Q-values.
            n_steps (int): The number of decay steps.
            num_agents (int): The number of agents to run concurrently, 0 is
                single process.
            model_sync_interval (int): The number of training steps between
                agent model syncs, if 0, all processes will share the same
                model.
            num_prefetch_batches (int): The number of batches to prefetch to the
                learner in distributed learning.
            local_batch_size (int): The number of experiences the agent sends at
                once in distributed learning.
            vectorized (bool): If the environment is vectorized.
            recurrent (bool): Make the network recurrent (using LSTM).
            play (bool): Runs the environment using the model instead of
                training.
            exploration (str, ["rnd", "munchausen"]): The type of exploration to
                use.
            episodes (int): The number of episodes to play for if playing.
            er_capacity (int): The capacity of the experience replay buffer.
            batch_size (int): The batch size of the training set.
            training_steps (int): The number of training steps to train for.
            start_size (int): The size of the replay buffer before training.
            er_alpha (float): The alpha value for PER.
            er_beta (float): The beta value for PER.
            er_beta_increment (float): The increment of the beta value on each
                sample for PER.
            er_epsilon (float): The epsilon value for PER.
            burn_in_length (int): If recurrent, the number of burn in samples
                for R2D2.
            sequence_length (int): If recurrent, the length of the sequence to
                train on.
            max_factor (int): If recurrent, factor of max priority to mean
                priority for R2D2.
            device: The device to run the model on when playing.
            save_interval (int): The number of training steps between model
                saves.
        }

        Args:
            args: The namespace of arguments for training.
            env_builder: The nullary function to create the environment.
            algo: The algorithm to train.
        """
        logs_path = None
        save_path = None

        if args.experiment_path is not None:
            # Separate subdirectories for tensorboard logs and saved models.
            logs_path = Path(args.experiment_path, "logs")
            logs_path.mkdir(parents=True, exist_ok=True)
            logs_path = str(logs_path)

            save_path = Path(args.experiment_path, "models")
            save_path.mkdir(parents=True, exist_ok=True)
            save_path = str(save_path)

        # Create agent class
        agent_builder = partial(
            OffPolicyAgent, algo=algo, render=args.render, silent=args.silent
        )

        steps_per_episode = (
            args.steps_per_episode if "steps_per_episode" in args else None
        )

        agent_builder = compose(
            agent_builder,
            partial(TimeLimitAgent, max_steps=steps_per_episode)
        )

        if not args.play:
            # Experience replay
            # Won't increment in multiple processes to keep it consistent
            # across actors
            er_beta_increment = (
                args.er_beta_increment if args.num_agents == 0 else 0
            )

            # R2D2 replay for recurrent networks, plain PER otherwise.
            if args.recurrent:
                experience_replay_func = partial(
                    TorchR2D2, alpha=args.er_alpha, beta=args.er_beta,
                    beta_increment=er_beta_increment, epsilon=args.er_epsilon,
                    max_factor=args.max_factor
                )
            else:
                experience_replay_func = partial(
                    TorchPER, alpha=args.er_alpha, beta=args.er_beta,
                    beta_increment=er_beta_increment, epsilon=args.er_epsilon
                )

            if args.num_agents > 0:
                recv_pipes = []
                send_pipes = []

                prestart_func = None

                if args.model_sync_interval == 0:
                    # All worker processes share one model in shared memory,
                    # so no parameter pipes are needed.
                    self._start_training(algo, args)
                    algo.share_memory()

                    recv_pipes = [None] * args.num_agents
                else:
                    prestart_func = partial(
                        self._start_training, algo=algo, args=args
                    )

                    # Force CPU for now to avoid re-instantiating cuda in
                    # subprocesses
                    algo.device = torch.device("cpu")
                    algo = algo.to(algo.device)

                    # One parameter pipe per agent for periodic model syncs.
                    for i in range(args.num_agents):
                        param_pipe = mp.Pipe(duplex=False)

                        recv_pipes.append(param_pipe[0])
                        send_pipes.append(param_pipe[1])

                # Just needed to get the error/priority calculations
                dummy_experience_replay = experience_replay_func(capacity=1)

                # Must come before the other wrapper since there are infinite
                # recursion errors
                # TODO come up with a better way to implement wrappers
                agent_builder = compose(
                    agent_builder,
                    partial_iterator(
                        QueueAgent,
                        agent_id=(iter(range(args.num_agents)), True),
                        experience_replay=(dummy_experience_replay, False),
                        param_pipe=(iter(recv_pipes), True)
                    )
                )

        agent_builder = compose(
            agent_builder,
            partial(TorchRLAgent, batch_state=not args.vectorized)
        )

        if "action_mask" in args and args.action_mask:
            # TODO: Will have to add an action mask wrapper later
            if args.masked:
                agent_builder = compose(
                    agent_builder,
                    partial(
                        UnmaskedActionAgent, action_mask=args.action_mask,
                        default_action=args.default_action
                    )
                )

        agent_builder = compose(agent_builder, TorchOffPolicyAgent)

        if args.recurrent:
            agent_builder = compose(
                agent_builder, SequenceInputAgent, TorchRecurrentAgent
            )

        if args.play:
            algo = algo.to(args.device)
            algo.eval()

            agent_logger = (
                None if logs_path is None
                else TensorboardLogger(logs_path + "/play-agent")
            )

            agent = agent_builder(env=env_builder(), logger=agent_logger)
            agent.play(args.episodes)
        else:
            if args.exploration == "rnd":
                agent_builder = compose(agent_builder, IntrinsicRewardAgent)
            elif args.exploration == "munchausen":
                agent_builder = compose(
                    agent_builder, partial(MunchausenAgent, alpha=0.9)
                )

            algo.train()

            if args.recurrent:
                # Stored sequences include the burn-in prefix; consecutive
                # sequences overlap by the burn-in length (R2D2).
                agent_builder = compose(
                    agent_builder,
                    partial(
                        ExperienceSequenceAgent,
                        sequence_length=(
                            args.burn_in_length + args.sequence_length
                        ),
                        overlap=args.burn_in_length
                    )
                )

            experience_replay = experience_replay_func(
                capacity=args.er_capacity
            )

            base_agent_logs_path = None
            if logs_path is not None:
                base_agent_logs_path = logs_path + "/train-agent"

            # Single process
            if args.num_agents == 0:
                self._start_training(algo, args)

                agent_logger = None
                if base_agent_logs_path is not None:
                    agent_logger = TensorboardLogger(base_agent_logs_path)

                agent = agent_builder(env=env_builder(), logger=agent_logger)

                agent.train(
                    args.episodes, 1, args.discount, args.n_steps,
                    experience_replay, args.batch_size, args.start_size,
                    save_path, args.save_interval
                )

            # Multiple processes
            else:
                done_event = mp.Event()

                # Number of agents + worker + learner
                queue_barrier = mp.Barrier(args.num_agents + 2)

                agent_queue = mp.Queue(
                    maxsize=args.num_prefetch_batches * args.num_agents * 4
                )
                sample_queue = mp.Queue(maxsize=args.num_prefetch_batches)
                priority_queue = mp.Queue(maxsize=args.num_prefetch_batches)

                # dummy_experience_replay was built in the num_agents > 0
                # setup above; this branch is only reached in that case.
                learner_args = (dummy_experience_replay,)
                learner_train_args = (
                    algo, done_event, queue_barrier, args.training_steps,
                    sample_queue, priority_queue, send_pipes,
                    args.model_sync_interval, save_path, args.save_interval
                )

                worker = TorchApexWorker()
                worker_args = (
                    experience_replay, done_event, queue_barrier, agent_queue,
                    sample_queue, priority_queue, args.batch_size,
                    args.start_size
                )

                agent_builders = []
                agent_train_args = []
                agent_train_kwargs = []

                for i in range(args.num_agents):
                    agent_logger = None
                    if base_agent_logs_path is not None:
                        # Per-agent log directory, 1-indexed suffix.
                        agent_logs_path = (
                            base_agent_logs_path + "-" + str(i + 1)
                        )
                        agent_logger = TensorboardLogger(agent_logs_path)

                    agent_builders.append(
                        partial(agent_builder, logger=agent_logger)
                    )

                    agent_train_args.append((
                        1, args.local_batch_size, args.discount, args.n_steps,
                        agent_queue, queue_barrier
                    ))
                    agent_train_kwargs.append({
                        "exit_condition": done_event.is_set
                    })

                runner = ApexRunner(done_event)
                runner.start(
                    learner_args, learner_train_args, worker, worker_args,
                    env_builder, agent_builders, agent_train_args,
                    agent_train_kwargs, prestart_func
                )
Esempio n. 15
0
        v.env = env
        v.env.canvas = v.canvas
        v.visualize()

    gen = env.run(True)
    gen.send(None)
    while True:
        try:
            gen.send((None, ))
        except StopIteration:
            #barrier.wait()
            torch.save(env.a3c_model.state_dict(),
                       "./tmp/model_%d_%d" % (env.game_no, os.getpid()))
            #barrier.wait()
            gen = env.run(True)
            gen.send(None)


if __name__ == '__main__':
    # Build the shared actor-critic network and randomly initialize weights.
    model = ActorCritic(5, 9, 64)
    import torch.nn.init as weight_init
    for param in model.parameters():
        # `nn.init.normal` (non-in-place) was deprecated and later removed;
        # `normal_` is the supported in-place initializer.
        weight_init.normal_(param)
    # Share parameters across worker processes for multiprocess training.
    model.share_memory()
    # Barrier synchronizing the two parallel workers.
    barrier = mp.Barrier(2)
    parallel.start_parallel(DoubleA3CPPOEnv,
                            model,
                            np=2,
                            func=test_without_gui,
                            args=barrier)