def init_process_group(strategy=None):
    nranks = ParallelEnv().nranks
    rank = ParallelEnv().local_rank
    is_master = rank == 0
    pg_group = dist.init_parallel_env()

    return pg_group.process_group
def get_path_from_url(url, md5sum=None, check_exist=True):
    """ Download from the given url to root_dir.

    If the file or directory specified by the url already exists under
    root_dir, return its path directly; otherwise download from the url,
    decompress it if needed, and return the path.

    Args:
        url (str): download url
        md5sum (str): md5 sum of the download package

    Returns:
        str: a local path to the downloaded models & weights & datasets.
    """
    from paddle.fluid.dygraph.parallel import ParallelEnv

    assert is_url(url), "downloading from {} not a url".format(url)

    root_dir = PPGAN_HOME
    # parse path after download to decompress under root_dir
    fullpath = _map_path(url, root_dir)

    if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum):
        logger.info("Found {}".format(fullpath))
    else:
        if ParallelEnv().local_rank == 0:
            fullpath = _download(url, root_dir, md5sum)
        else:
            while not os.path.exists(fullpath):
                time.sleep(1)

    if ParallelEnv().local_rank == 0:
        if tarfile.is_tarfile(fullpath) or zipfile.is_zipfile(fullpath):
            fullpath = _decompress(fullpath)

    return fullpath
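# Hedged usage sketch for get_path_from_url above. The URL and file name are
# hypothetical; the point is that every rank may call this safely, since only
# local rank 0 downloads while the other ranks poll until the file appears.
def demo_fetch_weights():
    weights_path = get_path_from_url(
        "https://example.com/models/generator.pdparams")
    print("weights cached at {}".format(weights_path))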
def __init__(self, model):
    super(StaticGraphAdapter, self).__init__()
    self.model = model
    # with `_build_once` gone, parameters are now created in `__init__`
    # so we need to keep track of the parameters already created
    self._startup_prog = fluid.default_startup_program()
    self._orig_prog = fluid.default_main_program()

    self._label_vars = {}  # label variables
    self._input_vars = {}  # input variables
    self._endpoints = {}
    self._loss_endpoint = None
    self._executor = None
    self._progs = {}
    self._compiled_progs = {}

    self._merge_count = {
        'eval_total': 0,
        'test_total': 0,
        'eval_batch': 0,
        'test_batch': 0
    }

    self._nranks = ParallelEnv().nranks
    self._local_rank = ParallelEnv().local_rank
def get_path(url, root_dir, md5sum=None, check_exist=True):
    """ Download from the given url to root_dir.

    If the file or directory specified by the url already exists under
    root_dir, return its path directly; otherwise download from the url,
    decompress it if needed, and return the path.

    url (str): download url
    root_dir (str): root dir for downloading, it should be
                    WEIGHTS_HOME or DATASET_HOME
    md5sum (str): md5 sum of the download package
    """
    assert is_url(url), "downloading from {} not a url".format(url)
    # parse path after download to decompress under root_dir
    fullpath = map_path(url, root_dir)
    exist_flag = False
    if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum):
        exist_flag = True
        if ParallelEnv().local_rank == 0:
            logger.info("Found {}".format(fullpath))
    else:
        if ParallelEnv().local_rank == 0:
            fullpath = _download(url, root_dir, md5sum)
        else:
            while not os.path.exists(fullpath):
                time.sleep(1)
    return fullpath, exist_flag
def init_process_group(strategy=None):
    nranks = ParallelEnv().nranks
    rank = ParallelEnv().local_rank
    is_master = rank == 0
    store = paddle.fluid.core.TCPStore("127.0.0.1", 6173, is_master, nranks)
    pg_group = core.ProcessGroupHCCL(store, rank, nranks)

    return pg_group
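# Minimal driver for the store-based init above, assuming a multi-process
# launch (e.g. via `python -m paddle.distributed.launch`); the fixed
# host/port pair ("127.0.0.1", 6173) must be reachable by every rank, and
# only the master rank binds the store.
def demo_process_group():
    pg = init_process_group()
    print("rank {} of {} joined the group".format(
        ParallelEnv().local_rank, ParallelEnv().nranks))
    return pg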
def prepare_leveldb(self,
                    input_file,
                    leveldb_file,
                    label_list,
                    max_seq_length,
                    tokenizer,
                    line_processor=None,
                    delimiter="\t",
                    quotechar=None):
    def default_line_processor(line_id, line):
        assert len(line) == 2
        text_a = line[0]
        label = line[1]

        return BertInputExample(
            str(line_id), text_a=text_a, text_b=None, label=label)

    if line_processor is None:
        line_processor = default_line_processor

    if ParallelEnv().nranks > 1:
        leveldb_file = leveldb_file + "_" + str(ParallelEnv().local_rank)

    if not os.path.exists(leveldb_file):
        print("putting data %s into leveldb %s" % (input_file, leveldb_file))
        _example_num = 0
        _db = leveldb.LevelDB(leveldb_file, create_if_missing=True)
        with io.open(input_file, "r", encoding="utf8") as f:
            reader = csv.reader(f, delimiter=delimiter, quotechar=quotechar)
            line_id = 0
            for (_line_id, line) in enumerate(reader):
                if line_processor(str(_line_id), line) is None:
                    continue

                line_str = delimiter.join(line)
                _db.Put(
                    str(line_id).encode("utf8"), line_str.encode("utf8"))
                line_id += 1
                _example_num += 1
        _db.Put("_example_num_".encode("utf8"),
                str(_example_num).encode("utf8"))
    else:
        _db = leveldb.LevelDB(leveldb_file, create_if_missing=False)

    self.label_list = label_list
    self.max_seq_length = max_seq_length
    self.tokenizer = tokenizer
    self.delimiter = delimiter
    self._db = _db
    self._line_processor = line_processor
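# Read-side counterpart sketch (a hypothetical helper, not part of the
# original class): fetch one raw line back out of the LevelDB written by
# prepare_leveldb and split it with the stored delimiter.
def get_raw_line(self, line_id):
    line_str = self._db.Get(str(line_id).encode("utf8")).decode("utf8")
    return line_str.split(self.delimiter)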
def main():
    device = set_device(FLAGS.device)
    fluid.enable_dygraph(device) if FLAGS.dynamic else None

    model_list = [x for x in models.__dict__["__all__"]]
    assert FLAGS.arch in model_list, \
        "Expected FLAGS.arch in {}, but received {}".format(
            model_list, FLAGS.arch)
    model = models.__dict__[FLAGS.arch](
        pretrained=FLAGS.eval_only and not FLAGS.resume)

    if FLAGS.resume is not None:
        model.load(FLAGS.resume)

    inputs = [Input([None, 3, 224, 224], 'float32', name='image')]
    labels = [Input([None, 1], 'int64', name='label')]

    train_dataset = ImageNetDataset(
        os.path.join(FLAGS.data, 'train'),
        mode='train',
        image_size=FLAGS.image_size,
        resize_short_size=FLAGS.resize_short_size)

    val_dataset = ImageNetDataset(
        os.path.join(FLAGS.data, 'val'),
        mode='val',
        image_size=FLAGS.image_size,
        resize_short_size=FLAGS.resize_short_size)

    optim = make_optimizer(
        np.ceil(len(train_dataset) * 1. / FLAGS.batch_size /
                ParallelEnv().nranks),
        parameter_list=model.parameters())

    model.prepare(optim, CrossEntropy(), Accuracy(topk=(1, 5)), inputs,
                  labels, FLAGS.device)

    if FLAGS.eval_only:
        model.evaluate(
            val_dataset,
            batch_size=FLAGS.batch_size,
            num_workers=FLAGS.num_workers)
        return

    output_dir = os.path.join(FLAGS.output_dir, FLAGS.arch,
                              time.strftime('%Y-%m-%d-%H-%M',
                                            time.localtime()))
    if ParallelEnv().local_rank == 0 and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    model.fit(train_dataset,
              val_dataset,
              batch_size=FLAGS.batch_size,
              epochs=FLAGS.epoch,
              save_dir=output_dir,
              num_workers=FLAGS.num_workers)
def get_path_from_url(url,
                      root_dir,
                      md5sum=None,
                      check_exist=True,
                      decompress=True,
                      method='get'):
    """ Download from the given url to root_dir.

    If the file or directory specified by the url already exists under
    root_dir, return its path directly; otherwise download from the url,
    decompress it if needed, and return the path.

    Args:
        url (str): download url
        root_dir (str): root dir for downloading, it should be
                        WEIGHTS_HOME or DATASET_HOME
        md5sum (str): md5 sum of the download package
        decompress (bool): decompress zip or tar file. Default is `True`
        method (str): which download method to use. Support `wget` and
                      `get`. Default is `get`.

    Returns:
        str: a local path to the downloaded models & weights & datasets.
    """
    from paddle.fluid.dygraph.parallel import ParallelEnv

    assert is_url(url), "downloading from {} not a url".format(url)
    # parse path after download to decompress under root_dir
    fullpath = _map_path(url, root_dir)
    # Mainly used to handle downloading in the multi-machine case: one
    # process per distinct ip downloads the data, and processes sharing
    # that ip wait for the single download to finish.
    unique_endpoints = _get_unique_endpoints(
        ParallelEnv().trainer_endpoints[:])
    if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum):
        logger.info("Found {}".format(fullpath))
    else:
        if ParallelEnv().current_endpoint in unique_endpoints:
            fullpath = _download(url, root_dir, md5sum, method=method)
        else:
            while not os.path.exists(fullpath):
                time.sleep(1)

    if ParallelEnv().current_endpoint in unique_endpoints:
        if decompress and (tarfile.is_tarfile(fullpath) or
                           zipfile.is_zipfile(fullpath)):
            fullpath = _decompress(fullpath)

    return fullpath
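# _get_unique_endpoints is referenced above but not defined here. A plausible
# sketch, assuming trainer endpoints are "ip:port" strings: keep one endpoint
# per distinct ip, so exactly one process per machine performs the download.
def _get_unique_endpoints_sketch(trainer_endpoints):
    seen_ips = set()
    unique_endpoints = set()
    for endpoint in trainer_endpoints:
        ip = endpoint.split(":")[0]
        if ip in seen_ips:
            continue
        seen_ips.add(ip)
        unique_endpoints.add(endpoint)
    return unique_endpoints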
def main(args):
    env_info = get_environ_info()
    places = fluid.CUDAPlace(ParallelEnv().dev_id) \
        if env_info['place'] == 'cuda' and fluid.is_compiled_with_cuda() \
        else fluid.CPUPlace()

    if args.dataset not in DATASETS:
        raise Exception('`--dataset` is invalid. it should be one of {}'
                        .format(str(list(DATASETS.keys()))))
    dataset = DATASETS[args.dataset]

    with fluid.dygraph.guard(places):
        eval_transforms = T.Compose(
            [T.Resize(args.input_size), T.Normalize()])
        eval_dataset = dataset(
            dataset_root=args.dataset_root,
            transforms=eval_transforms,
            mode='val')

        if args.model_name not in MODELS:
            raise Exception(
                '`--model_name` is invalid. it should be one of {}'.format(
                    str(list(MODELS.keys()))))
        model = MODELS[args.model_name](num_classes=eval_dataset.num_classes)

        evaluate(
            model,
            eval_dataset,
            model_dir=args.model_dir,
            num_classes=eval_dataset.num_classes)
def on_epoch_begin(self, epoch=None, logs=None):
    self.steps = self.params['steps']
    self.epoch = epoch
    self.train_step = 0
    if self.verbose and self.epochs and ParallelEnv().local_rank == 0:
        print('Epoch %d/%d' % (epoch + 1, self.epochs))
    self.train_progbar = ProgressBar(num=self.steps, verbose=self.verbose)
def main(args):
    env_info = get_environ_info()
    places = fluid.CUDAPlace(ParallelEnv().dev_id) \
        if env_info['Paddle compiled with cuda'] and env_info['GPUs used'] \
        else fluid.CPUPlace()

    if args.dataset not in DATASETS:
        raise Exception('`--dataset` is invalid. it should be one of {}'
                        .format(str(list(DATASETS.keys()))))
    dataset = DATASETS[args.dataset]

    with fluid.dygraph.guard(places):
        test_transforms = T.Compose(
            [T.Resize(args.input_size), T.Normalize()])
        test_dataset = dataset(
            dataset_root=args.dataset_root,
            transforms=test_transforms,
            mode='test')

        model = manager.MODELS[args.model_name](
            num_classes=test_dataset.num_classes)

        infer(
            model,
            model_dir=args.model_dir,
            test_dataset=test_dataset,
            save_dir=args.save_dir)
def _md5check(fullname, md5sum=None):
    if md5sum is None:
        return True

    if ParallelEnv().local_rank == 0:
        logger.info("File {} md5 checking...".format(fullname))
    md5 = hashlib.md5()
    with open(fullname, 'rb') as f:
        for chunk in iter(lambda: f.read(4096), b""):
            md5.update(chunk)
    calc_md5sum = md5.hexdigest()

    if calc_md5sum != md5sum:
        if ParallelEnv().local_rank == 0:
            logger.info("File {} md5 check failed, {}(calc) != "
                        "{}(base)".format(fullname, calc_md5sum, md5sum))
        return False
    return True
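# Companion sketch: computing the reference checksum to pass as md5sum,
# using the same 4096-byte chunking as _md5check above.
def compute_md5(fullname):
    md5 = hashlib.md5()
    with open(fullname, 'rb') as f:
        for chunk in iter(lambda: f.read(4096), b""):
            md5.update(chunk)
    return md5.hexdigest()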
def on_train_batch_end(self, step, logs=None):
    logs = logs or {}
    self.train_step += 1

    if self.train_step % self.log_freq == 0 and self.verbose and \
            ParallelEnv().local_rank == 0:
        if self.steps is None or self.train_step < self.steps:
            self._updates(logs, 'train')
def main():
    device = paddle.set_device(FLAGS.device)
    paddle.disable_static(device) if FLAGS.dynamic else None

    train_transform = Compose([
        GroupScale(),
        GroupMultiScaleCrop(),
        GroupRandomCrop(),
        GroupRandomFlip(),
        NormalizeImage()
    ])
    train_dataset = KineticsDataset(
        file_list=os.path.join(FLAGS.data, 'train_10.list'),
        pickle_dir=os.path.join(FLAGS.data, 'train_10'),
        label_list=os.path.join(FLAGS.data, 'label_list'),
        transform=train_transform)
    val_transform = Compose(
        [GroupScale(), GroupCenterCrop(), NormalizeImage()])
    val_dataset = KineticsDataset(
        file_list=os.path.join(FLAGS.data, 'val_10.list'),
        pickle_dir=os.path.join(FLAGS.data, 'val_10'),
        label_list=os.path.join(FLAGS.data, 'label_list'),
        mode='val',
        transform=val_transform)

    pretrained = FLAGS.eval_only and FLAGS.weights is None
    model = tsm_resnet50(
        num_classes=train_dataset.num_classes, pretrained=pretrained)

    step_per_epoch = int(len(train_dataset) / FLAGS.batch_size \
                         / ParallelEnv().nranks)
    optim = make_optimizer(step_per_epoch, model.parameters())

    model.prepare(
        optimizer=optim,
        loss=paddle.nn.CrossEntropyLoss(),
        metrics=paddle.metric.Accuracy(topk=(1, 5)))

    if FLAGS.eval_only:
        if FLAGS.weights is not None:
            model.load(FLAGS.weights, reset_optimizer=True)

        model.evaluate(
            val_dataset,
            batch_size=FLAGS.batch_size,
            num_workers=FLAGS.num_workers)
        return

    if FLAGS.resume is not None:
        model.load(FLAGS.resume)

    model.fit(train_data=train_dataset,
              eval_data=val_dataset,
              epochs=FLAGS.epoch,
              batch_size=FLAGS.batch_size,
              save_dir=FLAGS.save_dir or 'tsm_checkpoint',
              num_workers=FLAGS.num_workers,
              drop_last=True,
              shuffle=True)
def on_eval_begin(self, logs=None):
    self.eval_steps = logs.get('steps', None)
    self.eval_metrics = logs.get('metrics_name', [])
    self.eval_step = 0
    self.evaled_samples = 0
    self.eval_progbar = ProgressBar(
        num=self.eval_steps, verbose=self.verbose)
    if ParallelEnv().local_rank == 0:
        print('Eval begin...')
def set_device(device):
    assert isinstance(device, six.string_types) and device.lower() in ['cpu', 'gpu'], \
        "Expected device in ['cpu', 'gpu'], but got {}".format(device)

    place = fluid.CUDAPlace(ParallelEnv().dev_id) \
        if device.lower() == 'gpu' and fluid.is_compiled_with_cuda() \
        else fluid.CPUPlace()

    return place
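# Typical call site for the helper above, mirroring the training entry
# points in this collection: resolve the place once, then enable dygraph on
# it. Note that 'gpu' silently falls back to CPUPlace when PaddlePaddle is
# not compiled with CUDA.
def demo_set_device():
    place = set_device('gpu')
    fluid.enable_dygraph(place)
    return place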
def log(level=2, message=""):
    if ParallelEnv().local_rank == 0:
        current_time = time.time()
        time_array = time.localtime(current_time)
        current_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array)
        if log_level >= level:
            print("{} [{}]\t{}".format(current_time, levels[level], message)
                  .encode("utf-8").decode("latin1"))
            sys.stdout.flush()
def __init__(self, dataset, batch_size, shuffle=False, drop_last=True,
             seed=None):
    self._dataset = dataset
    self._batch_size = batch_size
    self._shuffle = shuffle
    self._drop_last = drop_last
    self._random = np.random
    self._random.seed(seed)

    self._nranks = ParallelEnv().nranks
    self._local_rank = ParallelEnv().local_rank
    self._device_id = ParallelEnv().dev_id

    self._num_samples = int(
        math.ceil(len(self._dataset) * 1.0 / self._nranks))
    self._total_size = self._num_samples * self._nranks
    self._epoch = 0
def on_eval_batch_end(self, step, logs=None):
    logs = logs or {}
    self.eval_step += 1
    samples = logs.get('batch_size', 1)
    self.evaled_samples += samples

    if self.eval_step % self.log_freq == 0 and self.verbose and \
            ParallelEnv().local_rank == 0:
        if self.eval_steps is None or self.eval_step < self.eval_steps:
            self._updates(logs, 'eval')
def __init__(self, dataset, batch_size, shuffle=False, drop_last=False):
    self.dataset = dataset

    assert isinstance(batch_size, int) and batch_size > 0, \
        "batch_size should be a positive integer"
    self.batch_size = batch_size
    assert isinstance(shuffle, bool), \
        "shuffle should be a boolean value"
    self.shuffle = shuffle
    assert isinstance(drop_last, bool), \
        "drop_last should be a boolean value"
    self.drop_last = drop_last

    self.nranks = ParallelEnv().nranks
    self.local_rank = ParallelEnv().local_rank
    self.epoch = 0
    self.num_samples = int(
        math.ceil(len(self.dataset) * 1.0 / self.nranks))
    self.total_size = self.num_samples * self.nranks
def __init__(self,
             dataset,
             batch_size,
             num_replicas=None,
             rank=None,
             shuffle=False,
             drop_last=False,
             consumed_samples=0):
    self.dataset = dataset

    assert isinstance(batch_size, int) and batch_size > 0, \
        "batch_size should be a positive integer"
    self.batch_size = batch_size
    assert isinstance(shuffle, bool), \
        "shuffle should be a boolean value"
    self.shuffle = shuffle
    assert isinstance(drop_last, bool), \
        "drop_last should be a boolean value"

    from paddle.fluid.dygraph.parallel import ParallelEnv

    if num_replicas is not None:
        assert isinstance(num_replicas, int) and num_replicas > 0, \
            "num_replicas should be a positive integer"
        self.nranks = num_replicas
    else:
        self.nranks = ParallelEnv().nranks

    if rank is not None:
        assert isinstance(rank, int) and rank >= 0, \
            "rank should be a non-negative integer"
        self.local_rank = rank
    else:
        self.local_rank = ParallelEnv().local_rank

    self.drop_last = drop_last
    self.epoch = 0
    self.consumed_samples = consumed_samples
    self.num_samples = int(
        math.ceil(len(self.dataset) * 1.0 / self.nranks))
    self.total_size = self.num_samples * self.nranks
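# The __init__ above only computes the shard sizes; this is a hedged sketch
# of the __iter__ that usually accompanies it, illustrating the padding and
# per-rank slicing implied by num_samples/total_size (not the verbatim
# implementation; consumed_samples handling is omitted for brevity).
def _iter_sketch(self):
    indices = np.arange(len(self.dataset)).tolist()
    # pad so every rank draws exactly num_samples indices
    indices += indices[:(self.total_size - len(indices))]
    if self.shuffle:
        np.random.RandomState(self.epoch).shuffle(indices)
        self.epoch += 1
    # take this rank's contiguous shard
    indices = indices[self.local_rank * self.num_samples:
                      (self.local_rank + 1) * self.num_samples]
    batch_indices = []
    for idx in indices:
        batch_indices.append(idx)
        if len(batch_indices) == self.batch_size:
            yield batch_indices
            batch_indices = []
    if not self.drop_last and len(batch_indices) > 0:
        yield batch_indices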
def on_eval_batch_end(self, step, logs=None):
    logs = logs or {}
    self.eval_step = step
    samples = logs.get('batch_size', 1)
    self.evaled_samples += samples

    if self.eval_step % self.log_freq == 0 and self.verbose and \
            ParallelEnv().local_rank == 0:
        # if steps is not None, the last step will be updated in on_epoch_end
        if self.eval_steps and self.eval_step < self.eval_steps:
            self._updates(logs, 'eval')
def on_train_batch_end(self, step, logs=None):
    logs = logs or {}
    self.train_step += 1

    if self.train_step % self.log_freq == 0 and self.verbose and \
            ParallelEnv().local_rank == 0:
        # if steps is not None, the last step will be updated in on_epoch_end
        if self.steps and self.train_step < self.steps:
            self._updates(logs, 'train')
        else:
            self._updates(logs, 'train')
def test_dygraph_gloo_init():
    """test gloo init and broadcast"""
    paddle.distributed.init_parallel_env()
    if ParallelEnv().local_rank == 0:
        np_data = np.array([4, 5])
    else:
        np_data = np.array([1, 2])
    data = paddle.to_tensor(np_data)
    paddle.distributed.broadcast(data, 1)
    res = data.numpy()
    # `res == [1, 2]` yields an element-wise array whose truth value is
    # ambiguous under `assert`; compare the whole array instead.
    assert np.array_equal(res, [1, 2])
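# Launch sketch for the test above (assumed command line and script name;
# the broadcast source is rank 1, so both processes should end up holding
# [1, 2]):
#
#   python -m paddle.distributed.launch --nproc_per_node=2 test_gloo_init.py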
def __init__(self,
             dataset,
             batch_size,
             pool_size=10000,
             sort_type=SortType.NONE,
             min_length=0,
             max_length=100,
             shuffle=False,
             shuffle_batch=False,
             use_token_batch=False,
             clip_last_batch=False,
             distribute_mode=True,
             seed=0):
    for arg, value in locals().items():
        if arg != "self":
            setattr(self, "_" + arg, value)
    self._random = np.random
    self._random.seed(seed)
    # for multi-devices
    self._distribute_mode = distribute_mode
    self._nranks = ParallelEnv().nranks
    self._local_rank = ParallelEnv().local_rank
    self._device_id = ParallelEnv().dev_id
def _download(url, path, md5sum=None):
    """ Download from url, save to path.

    url (str): download url
    path (str): download to given path
    """
    if not osp.exists(path):
        os.makedirs(path)

    fname = osp.split(url)[-1]
    fullname = osp.join(path, fname)
    retry_cnt = 0

    while not (osp.exists(fullname) and _md5check(fullname, md5sum)):
        if retry_cnt < DOWNLOAD_RETRY_LIMIT:
            retry_cnt += 1
        else:
            raise RuntimeError("Download from {} failed. "
                               "Retry limit reached".format(url))

        if ParallelEnv().local_rank == 0:
            logger.info("Downloading {} from {}".format(fname, url))

        req = requests.get(url, stream=True)
        if req.status_code != 200:
            raise RuntimeError("Downloading from {} failed with code "
                               "{}!".format(url, req.status_code))

        # To guard against interrupted downloads, download to
        # tmp_fullname first, then move tmp_fullname to fullname
        # once the download has finished.
        tmp_fullname = fullname + "_tmp"
        total_size = req.headers.get('content-length')
        with open(tmp_fullname, 'wb') as f:
            if total_size:
                for chunk in tqdm.tqdm(
                        req.iter_content(chunk_size=1024),
                        total=(int(total_size) + 1023) // 1024,
                        unit='KB'):
                    f.write(chunk)
            else:
                for chunk in req.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        shutil.move(tmp_fullname, fullname)

    return fullname
def set_device(device):
    """
    Paddle supports running computations on various types of devices,
    including CPU and GPU. They are represented by string identifiers. This
    function sets the global device on which OPs will run.

    Parameters:
        device(str): This parameter determines the specific running device.
            It can be ``cpu`` or ``gpu:0``. When ``device`` is ``cpu``, the
            program runs on the cpu. When ``device`` is ``gpu``, the program
            runs on the gpu.

    Examples:

     .. code-block:: python

        import paddle
        paddle.disable_static()

        paddle.set_device("cpu")
        x1 = paddle.ones(name='x1', shape=[1, 2], dtype='int32')
        x2 = paddle.zeros(name='x2', shape=[1, 2], dtype='int32')
        data = paddle.stack([x1, x2], axis=1)
    """
    lower_device = device.lower()
    if lower_device == 'cpu':
        place = core.CPUPlace()
    elif lower_device == 'gpu':
        if not core.is_compiled_with_cuda():
            raise ValueError(
                "The device should not be 'gpu', "
                "since PaddlePaddle is not compiled with CUDA")
        place = core.CUDAPlace(ParallelEnv().dev_id)
    else:
        available_device = re.match(r'gpu:\d+', lower_device)
        if not available_device:
            raise ValueError(
                "The device must be a string which is like 'cpu', 'gpu' or 'gpu:0'"
            )
        if not core.is_compiled_with_cuda():
            raise ValueError(
                "The device should not be {}, since PaddlePaddle is "
                "not compiled with CUDA".format(available_device))
        device_info_list = device.split(':', 1)
        device_id = device_info_list[1]
        device_id = int(device_id)
        place = core.CUDAPlace(device_id)

    framework._set_expected_place(place)
    return place
def setup_logger(output=None, name="hapi", log_level=logging.INFO):
    """
    Initialize the hapi logger and set its verbosity to the given log_level.

    Args:
        output (str): a file name or a directory to save log. If None, will
            not save a log file. If it ends with ".txt" or ".log", it is
            treated as a file name. Otherwise, logs will be saved to
            `output/log.txt`.
        name (str): the root module name of this logger. Default: 'hapi'.
        log_level (enum): log level. eg.'INFO', 'DEBUG', 'ERROR'.
            Default: logging.INFO.
    Returns:
        logging.Logger: a logger
    """
    logger = logging.getLogger(name)
    logger.propagate = False
    logger.setLevel(log_level)

    format_str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    # stdout logging: only local rank==0
    local_rank = ParallelEnv().local_rank
    if local_rank == 0 and len(logger.handlers) == 0:
        ch = logging.StreamHandler(stream=sys.stdout)
        ch.setLevel(log_level)
        ch.setFormatter(logging.Formatter(format_str))
        logger.addHandler(ch)

    # file logging: all workers
    if output is not None:
        if output.endswith(".txt") or output.endswith(".log"):
            filename = output
        else:
            filename = os.path.join(output, "log.txt")

        if local_rank > 0:
            filename = filename + ".rank{}".format(local_rank)

        if not os.path.exists(os.path.dirname(filename)):
            os.makedirs(os.path.dirname(filename))

        # FileHandler, not StreamHandler: StreamHandler expects a stream
        # object, while `filename` is a path string.
        fh = logging.FileHandler(filename)
        fh.setLevel(log_level)
        fh.setFormatter(logging.Formatter(format_str))
        logger.addHandler(fh)

    return logger
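# Minimal usage sketch for setup_logger, with a hypothetical output
# directory: rank 0 logs to stdout and "./log/log.txt", while every other
# rank writes only to its own "./log/log.txt.rank{N}" file.
def demo_setup_logger():
    log = setup_logger(output="./log", name="hapi")
    log.info("logger ready")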
def get_world_size():
    """
    Returns the number of trainers (number of processes participating in the
    current job).

    Its value is equal to the value of the environment variable
    ``PADDLE_TRAINERS_NUM`` . The default value is 1.

    Returns:
        (int) The number of trainers.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.distributed as dist

            # execute this command in terminal: export PADDLE_TRAINERS_NUM=4
            print("The world_size is %d" % dist.get_world_size())
            # The world_size is 4
    """
    return ParallelEnv().world_size
def get_rank():
    """
    Returns the rank of the current trainer.

    Its value is equal to the value of the environment variable
    ``PADDLE_TRAINER_ID`` . The default value is 0.

    Returns:
        (int) The rank of the current trainer.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.distributed as dist

            # execute this command in terminal: export PADDLE_TRAINER_ID=0
            print("The rank is %d" % dist.get_rank())
            # The rank is 0
    """
    return ParallelEnv().rank