Example #1
def all_gather_list(data, group=None, max_size=16384):
    """Gathers arbitrary data from all nodes into a list.

    Similar to :func:`~torch.distributed.all_gather` but for arbitrary Python
    data. Note that *data* must be picklable and any CUDA tensors will be moved
    to CPU and returned on CPU as well.

    Args:
        data (Any): data from the local worker to be gathered on other workers
        group: group of the collective
        max_size (int, optional): maximum size of the data to be gathered
            across workers
    """
    from fairseq import utils

    if group is None:
        group = get_global_group()
    rank = get_rank(group=group)
    world_size = get_world_size(group=group)

    buffer_size = max_size * world_size
    if (not hasattr(all_gather_list, "_buffer")
            or all_gather_list._buffer.numel() < buffer_size):
        all_gather_list._buffer = torch.cuda.ByteTensor(buffer_size)
        all_gather_list._cpu_buffer = torch.ByteTensor(max_size).pin_memory()
    buffer = all_gather_list._buffer
    buffer.zero_()
    cpu_buffer = all_gather_list._cpu_buffer

    data = utils.move_to_cpu(data)
    enc = pickle.dumps(data)
    enc_size = len(enc)
    header_size = 4  # size of header that contains the length of the encoded data
    size = header_size + enc_size
    if size > max_size:
        raise ValueError("encoded data size ({}) exceeds max_size ({})".format(
            size, max_size))

    header = struct.pack(">I", enc_size)
    cpu_buffer[:size] = torch.ByteTensor(list(header + enc))
    start = rank * max_size
    buffer[start:start + size].copy_(cpu_buffer[:size])

    all_reduce(buffer, group=group)

    buffer = buffer.cpu()
    try:
        result = []
        for i in range(world_size):
            out_buffer = buffer[i * max_size:(i + 1) * max_size]
            (enc_size,) = struct.unpack(">I", bytes(out_buffer[:header_size].tolist()))
            if enc_size > 0:
                result.append(
                    pickle.loads(bytes(out_buffer[header_size:header_size + enc_size].tolist()))
                )
        return result
    except pickle.UnpicklingError:
        raise Exception(
            "Unable to unpickle data from other workers. all_gather_list requires all "
            "workers to enter the function together, so this error usually indicates "
            "that the workers have fallen out of sync somehow. Workers can fall out of "
            "sync if one of them runs out of memory, or if there are other conditions "
            "in your training script that can cause one worker to finish an epoch "
            "while other workers are still iterating over their portions of the data. "
            "Try rerunning with --ddp-backend=legacy_ddp and see if that helps."
        )
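
A minimal usage sketch for the function above (not part of the original snippet): assuming torch.distributed is already initialized and all_gather_list is in scope as defined here, each worker can gather a small picklable stats object from every rank. The helper name and the dict keys are illustrative only.

import torch.distributed as dist

def gather_epoch_stats(local_stats):
    # local_stats can be any picklable object, e.g. {"loss": 1.72, "ntokens": 4096}
    gathered = all_gather_list(local_stats, max_size=16384)
    # `gathered` has one entry per worker, ordered by rank
    if dist.get_rank() == 0:
        for rank, stats in enumerate(gathered):
            print(f"rank {rank}: {stats}")
    return gathered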
Example #2
0
def all_gather_list(data, group=None, max_size=16384):
    """Gathers arbitrary data from all nodes into a list.

    Similar to :func:`~torch.distributed.all_gather` but for arbitrary Python
    data. Note that *data* must be picklable.

    Args:
        data (Any): data from the local worker to be gathered on other workers
        group (optional): group of the collective
        max_size (int, optional): maximum size of the data to be gathered
            across workers
    """
    from fairseq import utils

    rank = get_rank()
    world_size = get_world_size()

    buffer_size = max_size * world_size
    if not hasattr(all_gather_list, '_buffer') or \
            all_gather_list._buffer.numel() < buffer_size:
        all_gather_list._buffer = torch.cuda.ByteTensor(buffer_size)
        all_gather_list._cpu_buffer = torch.ByteTensor(max_size).pin_memory()
    buffer = all_gather_list._buffer
    buffer.zero_()
    cpu_buffer = all_gather_list._cpu_buffer

    data = utils.move_to_cpu(data)
    enc = pickle.dumps(data)
    enc_size = len(enc)
    if enc_size + 2 > max_size:
        raise ValueError('encoded data exceeds max_size: {}'.format(enc_size + 2))
    assert max_size < 255 * 256

    cpu_buffer[0] = enc_size // 255  # this encoding works for max_size < 65k
    cpu_buffer[1] = enc_size % 255
    cpu_buffer[2:enc_size + 2] = torch.ByteTensor(list(enc))
    start = rank * max_size
    size = enc_size + 2
    buffer[start:start + size].copy_(cpu_buffer[:size])

    all_reduce(buffer, group=group)
    try:
        result = []
        print(f"world_size:{world_size}")
        for i in range(world_size):
            out_buffer = buffer[i * max_size:(i + 1) * max_size]
            print(f"{type(out_buffer)} out_buffer:{out_buffer}")
            size = (255 * utils.item(out_buffer[0])) + utils.item(
                out_buffer[1])
            if size > 0:
                print(
                    f"{type(pickle.loads(bytes(out_buffer[2 : size + 2].tolist())))} pickle.loads:{pickle.loads(bytes(out_buffer[2 : size + 2].tolist()))}"
                )
                result.append(
                    pickle.loads(bytes(out_buffer[2:size + 2].tolist())))
        return result
    except pickle.UnpicklingError:
        raise Exception(
            'Unable to unpickle data from other workers. all_gather_list requires all '
            'workers to enter the function together, so this error usually indicates '
            'that the workers have fallen out of sync somehow. Workers can fall out of '
            'sync if one of them runs out of memory, or if there are other conditions '
            'in your training script that can cause one worker to finish an epoch '
            'while other workers are still iterating over their portions of the data.'
        )
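
For reference, this older variant packs the payload length into two header bytes as 255 * b0 + b1, which is why it asserts max_size < 255 * 256; Example #1 instead packs a 4-byte big-endian unsigned int with struct. A standalone sketch of the two-byte scheme (helper names are illustrative, not from fairseq):

def encode_size(enc_size):
    # valid for enc_size < 255 * 256, matching the assert in all_gather_list
    return bytes([enc_size // 255, enc_size % 255])

def decode_size(header):
    return 255 * header[0] + header[1]

assert decode_size(encode_size(300)) == 300
assert decode_size(encode_size(16382)) == 16382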
Example #3
def save_state(
    filename,
    cfg: FairseqConfig,
    model_state_dict,
    criterion,
    optimizer,
    lr_scheduler,
    num_updates,
    optim_history=None,
    extra_state=None,
    **kwargs,
):
    from fairseq import utils

    if optim_history is None:
        optim_history = []
    if extra_state is None:
        extra_state = {}
    state_dict = {
        "cfg": cfg,
        "args": kwargs.get("args", None),
        "model": model_state_dict or {},
        "optimizer_history": optim_history + [
            {
                "criterion_name": criterion.__class__.__name__,
                "optimizer_name": optimizer.__class__.__name__,
                "lr_scheduler_state": lr_scheduler.state_dict(),
                "num_updates": num_updates,
            }
        ],
        "extra_state": extra_state,
    }
    if utils.has_parameters(criterion):
        state_dict["criterion"] = criterion.state_dict()

    if cfg is None:
        cfg = state_dict["args"]
        assert cfg is not None, "must provide cfg or args"

    if isinstance(cfg, DictConfig):
        no_save_optimizer_state = cfg.checkpoint.no_save_optimizer_state
    else:
        no_save_optimizer_state = cfg.no_save_optimizer_state
    if not no_save_optimizer_state:
        state_dict["last_optimizer_state"] = optimizer.state_dict()

    # keep everything on CPU
    state_dict = utils.move_to_cpu(state_dict)

    if PathManager.supports_rename(filename):
        # do atomic save
        with PathManager.open(filename + ".tmp", "wb") as f:
            torch_persistent_save(state_dict, f)
        PathManager.rename(filename + ".tmp", filename)
    else:
        # fallback to non-atomic save
        with PathManager.open(filename, "wb") as f:
            torch_persistent_save(state_dict, f)
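
The tmp-then-rename step above is what makes the save atomic when the filesystem supports rename. A stripped-down sketch of the same pattern with plain file I/O (the helper name is illustrative; only os and torch.save are assumed):

import os
import torch

def atomic_torch_save(obj, filename):
    # write to a temporary file first, then rename into place, so a crash
    # mid-write never leaves a truncated checkpoint at `filename`
    tmp = filename + ".tmp"
    with open(tmp, "wb") as f:
        torch.save(obj, f)
    os.rename(tmp, filename)  # atomic on POSIX when both paths share a filesystem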


def main(args):
    state = checkpoint_utils.load_checkpoint_to_cpu(args.checkpoint)
    ns = state["args"]
    model = state["model"]
    ns.arch = "transformer_modular"

    if (args.encoder_attention_heads_active is None
            and args.decoder_attention_heads_active is None):
        raise ValueError(
            'Either --encoder-attention-heads-active or '
            '--decoder-attention-heads-active option must be set.')
    if args.encoder_attention_heads_active is None:
        args.encoder_attention_heads_active = args.decoder_attention_heads_active

    if args.encoder_modular_layer_indices is not None:
        ns.encoder_modular_layer_indices = "({})".format(
            args.encoder_modular_layer_indices)
        model = convert_model(model, ns, coder="encoder", att_type="self_attn")
    if args.decoder_modular_layer_indices is not None:
        ns.decoder_modular_layer_indices = "({})".format(
            args.decoder_modular_layer_indices)
        model = convert_model(model, ns, coder="decoder", att_type="self_attn")
        model = convert_model(model,
                              ns,
                              coder="decoder",
                              att_type="encoder_attn")

    ctrl_enc = ModularCtrl(ns.encoder_embed_dim,
                           ns.encoder_attention_heads,
                           args.encoder_attention_heads_active,
                           hidden_depth=args.ctrl_hidden_depth,
                           hidden_dim=args.ctrl_hidden_dim,
                           ctrl_type=args.ctrl_type)
    ns.module_ctrl_hidden_depth = args.ctrl_hidden_depth
    ns.module_ctrl_hidden_dim = args.ctrl_hidden_dim
    ns.module_ctrl_type = args.ctrl_type

    for k, v in ctrl_enc.state_dict().items():
        model["encoder.module_ctrl.{}".format(k)] = v

    if not args.share_encoder_ctrl:
        if args.decoder_attention_heads_active is None:
            raise ValueError("Missing ``decoder-attention-heads-active'' "
                             "when ``share-encoder-ctrl'' is disabled.")
        ns.share_encoder_ctrl = False
        ctrl_dec = ModularCtrl(ns.decoder_embed_dim,
                               ns.decoder_attention_heads,
                               args.decoder_attention_heads_active,
                               hidden_depth=args.ctrl_hidden_depth,
                               hidden_dim=args.ctrl_hidden_dim,
                               ctrl_type=args.ctrl_type)
        for k, v in ctrl_dec.state_dict().items():
            model["decoder.module_ctrl.{}".format(k)] = v
    else:
        ns.share_encoder_ctrl = True

    ns.arch = "transformer_modular"
    ns.criterion = "label_smoothed_cross_entropy_modular"
    ns.task = "translation_modular"
    ns.encoder_attention_heads_active = args.encoder_attention_heads_active

    state["args"] = ns
    state["model"] = model

    for entry in state["optimizer_history"]:
        entry["criterion_name"] = 'LabelSmoothedCrossEntropyModularCriterion'

    state = utils.move_to_cpu(state)

    with PathManager.open(args.save_as, "wb") as f:
        checkpoint_utils.torch_persistent_save(state, f)
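
The argument parser for this conversion script is not shown above; a minimal sketch consistent with the attributes that main() reads might look like the following (flag types and defaults are assumptions, only the option names follow from the code and its error messages).

import argparse

def parse_args():
    parser = argparse.ArgumentParser(
        description="Convert a Transformer checkpoint to the transformer_modular arch.")
    parser.add_argument("--checkpoint", required=True)
    parser.add_argument("--save-as", required=True)
    parser.add_argument("--encoder-attention-heads-active", default=None)
    parser.add_argument("--decoder-attention-heads-active", default=None)
    parser.add_argument("--encoder-modular-layer-indices", default=None)
    parser.add_argument("--decoder-modular-layer-indices", default=None)
    parser.add_argument("--ctrl-type", default=None)
    parser.add_argument("--ctrl-hidden-depth", type=int, default=None)
    parser.add_argument("--ctrl-hidden-dim", type=int, default=None)
    parser.add_argument("--share-encoder-ctrl", action="store_true")
    return parser.parse_args()

if __name__ == "__main__":
    main(parse_args())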