Example #1
def make_model(args, device, ntokens):
    ninp = 2048  # embedding dimension
    nhid = 2048  # the dimension of the feedforward network model in nn.TransformerEncoder
    nhead = 32  # the number of heads in the multiheadattention models
    dropout = 0
    initrange = 0.1
    ndecoder = args.num_decoder_layers

    if args.lazy_construction:
        layers = [
            lambda: EmbeddingLayer(ntokens, ninp, initrange),
            lambda: PositionalEncodingLayer(ninp, dropout),
        ]
        for _ in range(ndecoder):
            layers.append(
                lambda: TransformerDecoderLayer(ninp, nhead, nhid, dropout))

        layers.append(lambda: LinearLayer(ninp, ntokens, initrange))
        model = layers
    else:
        model = TransformerLMSequntial(ntokens, ninp, nhead, nhid, dropout,
                                       initrange, ndecoder).to(device)

    criterion = nn.CrossEntropyLoss()
    lr = 0.01  # learning rate

    def make_adam(model):
        return Adam(model.parameters(), lr=lr)

    optimizer = make_adam
    scaler = GradScaler()

    return model, criterion, optimizer, scaler
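A minimal usage sketch (not part of the original example): `make_model` returns the optimizer as a factory (`make_adam`), so the caller builds the Adam instance once the model exists. `args`, `device`, and `ntokens` are assumed to come from the surrounding benchmark script, and the lazy-construction branch is materialized here only for illustration.

model, criterion, optimizer_fn, scaler = make_model(args, device, ntokens)

if isinstance(model, list):
    # Lazy construction returned a list of layer callables; build them here
    # (the real benchmark may instead hand the list to a pipeline wrapper).
    model = nn.Sequential(*[make_layer() for make_layer in model]).to(device)

optimizer = optimizer_fn(model)  # the factory builds Adam over model.parameters()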
Example #2
def get_model_config():
    return {
        "vocab_size": 10000,
        "ninp": 2048,  # embedding dimension
        "nhid": 2048,  # the dimension of the feedforward network model in nn.TransformerEncoder
        "nhead": 32,  # the number of heads in the multiheadattention models
        "dropout": 0,
        "initrange": 0.1,
        "scaler": GradScaler(),
        "clip_value": 0.05,
        "num_decoder_layers": 10,
        "seq_len": 32,
    }
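A minimal sketch of how the dimensions in this config map onto the standard PyTorch decoder layer; the variable names below are illustrative and not part of the original benchmark.

import torch.nn as nn

config = get_model_config()
decoder_layer = nn.TransformerDecoderLayer(
    d_model=config["ninp"],          # embedding dimension
    nhead=config["nhead"],           # attention heads
    dim_feedforward=config["nhid"],  # feedforward width
    dropout=config["dropout"],
)
decoder = nn.TransformerDecoder(decoder_layer, num_layers=config["num_decoder_layers"])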
Example #3
def test_step_with_grad_scaler():
    weight, bias, input = make_half_precision_params()
    optimizer = Adam([weight, bias], lr=1e-3, precision=Precision.PURE_FP16)
    scaler = GradScaler()
    initial_value = None

    for _i in range(5):
        optimizer.zero_grad()
        loss = (weight.mv(input) + bias).pow(2).sum()
        if _i == 0:
            initial_value = loss.item()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    assert loss.item() < initial_value
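The helper `make_half_precision_params` is not shown above; a plausible stand-in, assuming it builds a small fp16 regression problem on the GPU, could look like this.

import torch

def make_half_precision_params():
    # Hypothetical stand-in for the undefined helper: a 4x3 fp16 weight,
    # an fp16 bias, and an fp16 input vector, with gradients on the params only.
    weight = torch.randn(4, 3, dtype=torch.float16, device="cuda", requires_grad=True)
    bias = torch.randn(4, dtype=torch.float16, device="cuda", requires_grad=True)
    input = torch.randn(3, dtype=torch.float16, device="cuda")
    return weight, bias, input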
Example #4
def get_benchmark_config():
    return {
        "epochs": 1,
        "vocab_size": 10000,
        "ninp": 2048,  # embedding dimension
        "nhid":
        2048,  # the dimension of the feedforward network model in nn.TransformerEncoder
        "nhead": 32,  # the number of heads in the multiheadattention models
        "dropout": 0,
        "initrange": 0.1,
        "criterion": nn.CrossEntropyLoss(),
        "lr": 0.001,  # learning rate
        "scaler": GradScaler(),
        "clip_value": 0.05,
        "batch_size": 8,
    }
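A hedged sketch of how `lr`, `criterion`, `clip_value`, and `scaler` from this config are typically combined in a single mixed-precision training step; `model`, `batch`, and `targets` are assumed to exist, and the loop follows the standard torch.cuda.amp recipe rather than any particular benchmark script.

import torch

config = get_benchmark_config()
optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
scaler = config["scaler"]

optimizer.zero_grad()
with torch.cuda.amp.autocast():
    output = model(batch)
    loss = config["criterion"](output.view(-1, config["vocab_size"]), targets.view(-1))
scaler.scale(loss).backward()
scaler.unscale_(optimizer)  # return gradients to true scale before clipping
torch.nn.utils.clip_grad_value_(model.parameters(), config["clip_value"])
scaler.step(optimizer)
scaler.update()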
Example #5
def create_benchmark_config(model_name):
    """Return a dict with configurations required for benchmarking `model_name` model."""

    if model_name == "lm":
        return {
            "vocab_size": 10000,
            "ninp": 2048,  # embedding dimension
            "nhid": 2048,  # the dimension of the feedforward network model in nn.TransformerEncoder
            "nhead": 32,  # the number of heads in the multiheadattention models
            "dropout": 0,
            "initrange": 0.1,
            "criterion": nn.CrossEntropyLoss(),
            "lr": 0.01,  # learning rate
            "scaler": GradScaler(),
            "clip_value": 0.05,
        }
    else:
        raise RuntimeError("Unrecognized model name: %s" % model_name)
Example #6
def make_model(device, ntokens):
    ninp = 50  # embedding dimension
    nhid = 50  # the dimension of the feedforward network model in nn.TransformerEncoder
    nhead = 2  # the number of heads in the multiheadattention models
    dropout = 0
    initrange = 0.1

    model = TransformerLMSequntial(ntokens, ninp, nhead, nhid, dropout, initrange).half().to(device)
    balance = generate_balance(min(num_devices, 4), len(model))
    p = Pipe(model, balance, chunks=len(balance))

    criterion = nn.CrossEntropyLoss()
    lr = 0.001  # learning rate

    try:
        optimizer = Adam(p.parameters(), lr=lr, precision=Precision.PURE_FP16)
    except NameError:
        optimizer = Adam(p.parameters(), lr=lr)
    scaler = GradScaler()

    return p, criterion, optimizer, scaler
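`generate_balance` and `num_devices` are assumed to be defined elsewhere in the benchmark module; a plausible stand-in that splits the model's layers as evenly as possible across the pipeline stages might be:

def generate_balance(num_devices, num_layers):
    # Hypothetical stand-in: distribute num_layers over num_devices stages,
    # giving the earlier stages one extra layer when the split is uneven.
    base, remainder = divmod(num_layers, num_devices)
    return [base + 1 if i < remainder else base for i in range(num_devices)]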