Example no. 1
def _gcnn_block(input):
    # Apply a single gated CNN layer to the input tensor.
    output = GatedCNN()(input)
    return output
Example no. 2
import collections
import time
from datetime import datetime

import torch
import torch.nn as nn
import torch.distributed as dist

from utils import read_words, create_batches, to_var
from gated_cnn import GatedCNN

# Hyperparameters such as seq_len, kernel, vocab_size, embd_size, n_layers,
# out_chs, res_block_count and batch_size are assumed to be module-level
# globals, defined as in Example no. 6.


def train(gpu, args):
    # Compute this process's global rank and join the NCCL process group.
    rank = args.nr * args.gpus + gpu
    dist.init_process_group(backend='nccl',
                            init_method='env://',
                            world_size=args.world_size,
                            rank=rank)
    torch.manual_seed(0)

    words = read_words(
        '/users/PAS1588/liuluyu0378/lab1/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled',
        seq_len, kernel[0])
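    # Build the vocabulary from the (vocab_size - 1) most frequent words; index 0 is reserved for '<unk>'.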
    word_counter = collections.Counter(words).most_common(vocab_size - 1)
    vocab = [w for w, _ in word_counter]
    w2i = dict((w, i) for i, w in enumerate(vocab, 1))
    w2i['<unk>'] = 0
    print('vocab_size', vocab_size)
    print('w2i size', len(w2i))

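    # Map words to indices (unknown words map to 0), batch the token stream, and split 80/20 into train/test.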
    data = [w2i[w] if w in w2i else 0 for w in words]
    data = create_batches(data, batch_size, seq_len)
    split_idx = int(len(data) * 0.8)
    training_data = data[:split_idx]
    test_data = data[split_idx:]
    print('train samples:', len(training_data))
    print('test samples:', len(test_data))

    model = GatedCNN(seq_len, vocab_size, embd_size, n_layers, kernel, out_chs,
                     res_block_count, vocab_size)

    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    # Wrap the model for multi-process distributed training (one process per GPU).
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    print("model transferred")

    # Define the loss function (the model is expected to emit log-probabilities) and optimizer.
    loss_fn = nn.NLLLoss()
    optimizer = torch.optim.Adadelta(model.parameters())
    # Data loading code

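    # Partition the training data with a DistributedSampler so each rank trains on a distinct shard.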
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        training_data, num_replicas=args.world_size, rank=rank)
    train_loader = torch.utils.data.DataLoader(dataset=training_data,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler)

    start = datetime.now()
    total_step = len(train_loader)

    print("loaded")
    for epoch in range(args.epochs):
        a = time.time()
        print('----epoch', epoch)
        # random.shuffle(data)
        # print(len(data))
        for batch_ct, (X, Y) in enumerate(train_loader):
            X = to_var(torch.LongTensor(X))  # (bs, seq_len)
            Y = to_var(torch.LongTensor(Y))  # (bs,)
            # print(X.size(), Y.size())
            # print(X)
            # print(batch_ct, X.size(), Y.size())
            pred = model(X)  # (bs, ans_size)
            # _, pred_ids = torch.max(pred, 1)
            loss = loss_fn(pred, Y)
            if batch_ct % 100 == 0:
                print('loss: {:.4f}'.format(loss.data.item()))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        b = time.time()
        print('epoch', epoch, 'finished in', b - a, 'seconds')

    if gpu == 0:
        print("Training complete in: " + str(datetime.now() - start))
Example no. 3
    word_counter = collections.Counter(words).most_common(vocab_size - 1)
    vocab = [w for w, _ in word_counter]
    w2i = dict((w, i) for i, w in enumerate(vocab, 1))
    w2i['<unk>'] = 0
    print('vocab_size', vocab_size)
    print('w2i size', len(w2i))

    data = [w2i[w] if w in w2i else 0 for w in words]
    data = create_batches(data, batch_size, seq_len)
    split_idx = int(len(data) * 0.8)
    training_data = data[:split_idx]
    test_data = data[split_idx:]
    print('train samples:', len(training_data))
    print('test samples:', len(test_data))

    model = GatedCNN(seq_len, vocab_size, embd_size, n_layers, kernel, out_chs,
                     res_block_count, vocab_size)
    # Move the model to the GPU when CUDA is available.
    cuda = torch.cuda.is_available()
    if cuda:
        print("cuda")
        model.cuda()

    distributed_mode = False
    if distributed_mode:
        # sampler = DistributedSampler(training_data, num_replicas=world_size, rank=rank)
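        # DataParallel and DistributedDataParallelCPU are assumed to be imported from
        # torch.nn.parallel; DistributedDataParallelCPU only exists in older PyTorch releases.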
        if cuda:
            model = DataParallel(model)
        else:
            model = DistributedDataParallelCPU(model)
Example no. 4
# Horovod: use DistributedSampler to partition the training data.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=args.batch_size,
                                           sampler=train_sampler,
                                           **kwargs)

test_dataset = test_data
# Horovod: use DistributedSampler to partition the test data.
test_sampler = torch.utils.data.distributed.DistributedSampler(
    test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
test_loader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=args.test_batch_size,
                                          sampler=test_sampler,
                                          **kwargs)

model = GatedCNN(seq_len, vocab_size, embd_size, n_layers, kernel, out_chs,
                 res_block_count, vocab_size)

if args.cuda:
    # Move model to GPU.
    model.cuda()

# Horovod: scale learning rate by the number of GPUs.
optimizer = optim.SGD(model.parameters(),
                      lr=args.lr * hvd.size(),
                      momentum=args.momentum)

# Horovod: broadcast parameters & optimizer state.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)

# Horovod: (optional) compression algorithm.
Example no. 5
 if FLAGS.model == "text_cnn":  # flag value assumed; the original condition is truncated
     cnn = TextCNN(
         sequence_length=x_train.shape[1],
         num_classes=len(y_train[1]),
         vocab_size=len(vocab_processor.vocabulary_),
         embedding_size=FLAGS.embedding_dim,
         filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
         num_filters=FLAGS.num_filters,
         l2_reg_lambda=FLAGS.l2_reg_lambda,
         learning_rate=FLAGS.learning_rate)
 elif FLAGS.model == "gate_cnn":
     from gated_cnn import GatedCNN
     cnn = GatedCNN(
         sequence_length=x_train.shape[1],
         num_classes=len(y_train[1]),
         vocab_size=len(vocab_processor.vocabulary_),
         embedding_size=FLAGS.embedding_dim,
         filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
         num_filters=FLAGS.num_filters,
         l2_reg_lambda=FLAGS.l2_reg_lambda,
         learning_rate=FLAGS.learning_rate)
 elif FLAGS.model == "gate_cnn_nopadding":
     from gated_cnn_nopadding import GatedCNN_nopadding
     cnn = GatedCNN_nopadding(
         sequence_length=x_train.shape[1],
         num_classes=len(y_train[1]),
         vocab_size=len(vocab_processor.vocabulary_),
         embedding_size=FLAGS.embedding_dim,
         filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
         num_filters=FLAGS.num_filters,
         l2_reg_lambda=FLAGS.l2_reg_lambda,
         learning_rate=FLAGS.learning_rate)
Example no. 6
import collections

import torch
import torch.nn as nn

from utils import read_words, create_batches, to_var
from gated_cnn import GatedCNN

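# Model and training hyperparameters.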
vocab_size = 2000
seq_len = 21
embd_size = 200
n_layers = 10
kernel = (5, embd_size)
out_chs = 64
res_block_count = 5
batch_size = 1024

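# Build the GatedCNN language model and report its total parameter count.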
model = GatedCNN(seq_len, vocab_size, embd_size, n_layers, kernel, out_chs,
                 res_block_count, vocab_size)
pytorch_total_params = sum(p.numel() for p in model.parameters())
print(pytorch_total_params)
words = read_words(
    '/users/PAS1588/liuluyu0378/lab1/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled',
    seq_len, kernel[0])
word_counter = collections.Counter(words).most_common(vocab_size - 1)
vocab = [w for w, _ in word_counter]
w2i = dict((w, i) for i, w in enumerate(vocab, 1))
w2i['<unk>'] = 0
print('vocab_size', vocab_size)
print('w2i size', len(w2i))

data = [w2i[w] if w in w2i else 0 for w in words]
data = create_batches(data, batch_size, seq_len)
split_idx = int(len(data) * 0.8)