Example #1

try:
    with open(args.vocab_file, 'r'):
        # We're using a pre-existing vocab file, so we shouldn't overwrite it
        args.predefined_vocab_flag = True
except FileNotFoundError:
    # We should create a new vocab file
    args.predefined_vocab_flag = False

corpus = data.SentenceCorpus(args.data_dir,
                             args.vocab_file,
                             args.test,
                             args.interact,
                             checkpoint_flag=args.load_checkpoint,
                             predefined_vocab_flag=args.predefined_vocab_flag,
                             collapse_nums_flag=args.collapse_nums_flag,
                             multisentence_test_flag=args.multisentence_test,
                             lower_flag=args.lowercase,
                             trainfname=args.trainfname,
                             validfname=args.validfname,
                             testfname=args.testfname)

if not args.interact:
    if args.test:
        if args.multisentence_test:
            test_data = [corpus.test]
        else:
            test_sents, test_data = corpus.test
    else:
        train_data = batchify(corpus.train, args.batch_size)
        val_data = batchify(corpus.valid, args.batch_size)
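
The try/except above doubles as an existence check for the vocab file. An equivalent minimal sketch using os.path (an illustrative alternative, not code from this script):

import os.path

# Reuse an existing vocab file; build a new one otherwise
args.predefined_vocab_flag = os.path.isfile(args.vocab_file)

The open()-based variant has the small advantage of also confirming the file is readable, not just present.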
Example #2
    if args.cuda:
        model = torch.load(f).to(device)
    else:
        model = torch.load(f, map_location='cpu')
    # After loading, the RNN parameters are not one contiguous chunk of memory;
    # flatten_parameters() makes them contiguous, which speeds up the forward pass.
    if args.cuda and (not args.single) and (torch.cuda.device_count() > 1):
        model.module.rnn.flatten_parameters()
    else:
        if isinstance(model, torch.nn.DataParallel):
            model = model.module
        model.rnn.flatten_parameters()
model.eval()
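
# (Illustrative aside, hypothetical sizes: flatten_parameters() re-packs the
# RNN weights into one contiguous buffer, e.g.
#     rnn = torch.nn.LSTM(input_size=200, hidden_size=200, num_layers=2)
#     rnn.flatten_parameters()  # cheap; a no-op when weights are already contiguous
# Without it, cuDNN warns about non-contiguous weights and compacts them on
# every forward call.)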

corpus = data.SentenceCorpus(args.data_dir,
                             args.vocab_file,
                             generate_flag=True)

ntokens = len(corpus.dictionary)
if args.cuda and (not args.single) and (torch.cuda.device_count() > 1):
    hidden = model.module.init_hidden(1)
else:
    hidden = model.init_hidden(1)
input_sequence = torch.rand(1, 1).mul(ntokens).long()
if args.cuda:
    input_sequence = input_sequence.to(device)

with open(args.outf, 'w') as outf, torch.no_grad():  # inference only; avoids growing the autograd graph
    for i in range(args.numwords):
        output, hidden = model(input_sequence, hidden)
        word_weights = output.squeeze().data.div(args.temperature).exp().cpu()
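
Example #2 is cut off right after the temperature-scaled weights are computed. The usual continuation, shown in full in Examples #5, #7, and #8 below, samples the next token and feeds it back in (a sketch; the exact outf formatting is an assumption):

        word_idx = torch.multinomial(word_weights, 1)[0]
        input_sequence.fill_(word_idx)
        word = corpus.dictionary.idx2word[word_idx]
        outf.write(word + ' ')  # hypothetical output format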
Example #3
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    # Turning the data over to CUDA at this point may lead to more OOM errors
    #if args.cuda:
    #    data = data.cuda()
    if isinstance(data, tuple):
        return data, tag_data
    return data


eval_batch_size = 10

corpus = data.SentenceCorpus(args.lm_data,
                             args.ccg_data,
                             args.save_lm_data,
                             args.test,
                             trainfname=args.trainfname,
                             validfname=args.validfname,
                             testfname=args.testfname)

if args.test:
    test_lm_sentences, test_lm_data = corpus.test_lm
    if args.ccg_data:
        test_ccg_sentences, test_ccg_data = corpus.test_ccg
    else:
        test_ccg_sentences = []
        test_ccg_data = []
else:
    train_lm_data = batchify(corpus.train_lm, args.batch_size)
    val_lm_data = batchify(corpus.valid_lm, eval_batch_size)
    train_ccg_data = batchify(corpus.train_ccg, args.batch_size)
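    # (presumably followed by the symmetric line:
    #  val_ccg_data = batchify(corpus.valid_ccg, eval_batch_size))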
Example #4
# Starting from sequential data, batchify arranges the dataset into columns
# (e.g. the alphabet with batch size 4 becomes columns a-f, g-l, m-r, s-x).
# These columns are treated as independent by the model, which means that the
# dependence of e.g. 'g' on 'f' cannot be learned, but allows more efficient
# batch processing.

def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    # Note: moving the whole dataset to the device here is convenient, but on
    # large corpora it can trigger OOM; moving batches on demand avoids that.
    return data.to(device)

corpus = data.SentenceCorpus(args.data_dir, args.vocab_file, args.test, args.interact,
                             trainfname=args.trainfname,
                             validfname=args.validfname,
                             testfname=args.testfname)

if not args.interact:
    if args.test:
        test_sents, test_data = corpus.test
    else:
        train_data = batchify(corpus.train, args.batch_size)
        val_data = batchify(corpus.valid, args.batch_size)

###############################################################################
# Build/load the model
###############################################################################

if not args.test and not args.interact:
    ntokens = len(corpus.dictionary)
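
The batchify helper above is self-contained enough to sanity-check on toy data. A minimal sketch (hypothetical; assumes torch is imported and a device is defined, as in the surrounding script):

demo = batchify(torch.arange(12), 4)
# shape (3, 4); each column is a contiguous run of the original sequence:
# tensor([[ 0,  3,  6,  9],
#         [ 1,  4,  7, 10],
#         [ 2,  5,  8, 11]])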
Example #5
    parser.error("--temperature has to be greater or equal 1e-3")

with open(args.model_file, 'rb') as f:
    if args.cuda:
        model = torch.load(f).to(device)
    else:
        model = torch.load(f, map_location='cpu')
    # After loading, the RNN parameters are not one contiguous chunk of memory;
    # flatten_parameters() makes them contiguous, which speeds up the forward pass.
    if args.cuda and (not args.single) and (torch.cuda.device_count() > 1):
        model.module.rnn.flatten_parameters()
    else:
        model.rnn.flatten_parameters()
model.eval()

corpus = data.SentenceCorpus(args.data_dir, args.vocab_file, True)

ntokens = len(corpus.dictionary)
if args.cuda and (not args.single) and (torch.cuda.device_count() > 1):
    hidden = model.module.init_hidden(1)
else:
    hidden = model.init_hidden(1)
input = torch.rand(1, 1).mul(ntokens).long()  # torch.tensor(tensor) would make a redundant copy and warn
if args.cuda:
    input = input.to(device)

with open(args.outf, 'w') as outf, torch.no_grad():  # inference only; avoids growing the autograd graph
    for i in range(args.numwords):
        output, hidden = model(input, hidden)
        word_weights = output.squeeze().data.div(args.temperature).exp().cpu()
        word_idx = torch.multinomial(word_weights, 1)[0]
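
The .div(args.temperature).exp() pattern above implements temperature scaling: the unnormalized weights exp(logits / T) that multinomial draws from are equivalent to softmax(logits / T). A self-contained sketch with made-up logits:

import torch

logits = torch.tensor([2.0, 1.0, 0.1])
for t in (0.5, 1.0, 2.0):
    probs = torch.softmax(logits / t, dim=0)
    # t < 1 sharpens the distribution toward the argmax; t > 1 flattens it
    print(t, probs)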
Example #6
            files = sorted(files)
            train_files = []
            valid_files = []
            for file in files:
                prefix = file.split('_')[0]
                if prefix == 'train':
                    train_files.append(file)
                if prefix == 'valid':
                    valid_files.append(file)
            print('Start training!!!')
            for epoch in range(1, args.epochs+1):
                valid_fname = random.choice(valid_files)
                for train_fname in train_files:
                    corpus = data.SentenceCorpus(args.bptt, args.lm_data, args.tag_data, 
                                                 word2idx, tag2idx, idx2word, idx2tag,
                                                 train_fname, valid_fname, None, testflag=args.test)
    
                    train_lm_data = batchify(corpus.train_lm, args.batch_size)
                    train_masking = batchify(corpus.train_maksing, args.batch_size)
                    train_ccg_data = batchify(corpus.train_tag, args.batch_size)
                    
                    epoch_start_time = time.time()
                    train(args, model, train_lm_data, train_masking, train_ccg_data, criterion, optimizer)

                    val_lm_data = batchify(corpus.valid_lm, args.batch_size)
                    val_masking = batchify(corpus.valid_maksing, args.batch_size)
                    val_ccg_data = batchify(corpus.valid_tag, args.batch_size)
                    val_loss = evaluate(args, model, val_lm_data, val_masking, val_ccg_data)
                    print('-' * 80)
                    print('| end of {} | time: {:5.2f}s | valid loss {:5.4f} '.format(
                        train_fname, time.time() - epoch_start_time, val_loss))
Example #7
    else:
        torch.cuda.manual_seed(args.seed)

if args.temperature < 1e-3:
    parser.error("--temperature has to be greater than or equal to 1e-3")

with open(args.checkpoint, 'rb') as f:
    # map_location lets a GPU-trained checkpoint load on a CPU-only machine
    model = torch.load(f, map_location=None if args.cuda else 'cpu')
model.eval()

if args.cuda:
    model.cuda()
else:
    model.cpu()

corpus = data.SentenceCorpus(args.data, args.lm_data, True,
                             testfname=args.testfname)

ntokens = len(corpus.dictionary)
hidden = model.init_hidden(1)
input = torch.rand(1, 1).mul(ntokens).long()  # volatile Variables are deprecated; see torch.no_grad() below
if args.cuda:
    input = input.cuda()

with open(args.outf, 'w') as outf, torch.no_grad():  # no_grad replaces volatile=True
    for i in range(args.words):
        output, hidden = model(input, hidden)
        word_weights = output.squeeze().data.div(args.temperature).exp().cpu()
        word_idx = torch.multinomial(word_weights, 1)[0]
        input.fill_(word_idx)
        word = corpus.dictionary.idx2word[word_idx]
Example #8
    else:
        torch.cuda.manual_seed(args.seed)

if args.temperature < 1e-3:
    parser.error("--temperature has to be greater than or equal to 1e-3")

with open(args.model_file, 'rb') as f:
    # map_location lets a GPU-trained checkpoint load on a CPU-only machine
    model = torch.load(f, map_location=None if args.cuda else 'cpu')
model.eval()

if args.cuda:
    model.cuda()
else:
    model.cpu()

corpus = data.SentenceCorpus(args.data_dir, args.vocab_file, True,
                             testfname=args.testfname)

ntokens = len(corpus.dictionary)
hidden = model.init_hidden(1)
input = torch.rand(1, 1).mul(ntokens).long()  # volatile Variables are deprecated; see torch.no_grad() below
if args.cuda:
    input = input.cuda()

with open(args.outf, 'w') as outf, torch.no_grad():  # no_grad replaces volatile=True
    for i in range(args.numwords):
        output, hidden = model(input, hidden)
        word_weights = output.squeeze().data.div(args.temperature).exp().cpu()
        word_idx = torch.multinomial(word_weights, 1)[0]
        input.fill_(word_idx)
        word = corpus.dictionary.idx2word[word_idx]