Example no. 1
def save_checkpoint(epoch, model, optimizer, model_path, dataset, use_cuda, emb_path, is_best):
    write_embedding(dataset.vocab.itos, model.criterion.W, use_cuda, emb_path + '.epoch_' + str(epoch + 1))
    torch.save(model.state_dict(), model_path + '.epoch_' + str(epoch + 1))
    torch.save(optimizer.state_dict(), model_path + '_optim' + '.epoch_' + str(epoch + 1))
    if is_best:
        write_embedding(dataset.vocab.itos, model.criterion.W, use_cuda, emb_path)
        torch.save(model.state_dict(), model_path)
        torch.save(optimizer.state_dict(), model_path + '_optim')
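For orientation, a minimal sketch of how this per-epoch variant could be driven from a training loop; run_one_epoch and best_loss are hypothetical names used only for illustration, not part of the original code.

best_loss = float('inf')
for epoch in range(epochs):
    epoch_loss = run_one_epoch(model, optimizer, dataset)  # hypothetical helper returning the epoch loss
    is_best = epoch_loss < best_loss
    best_loss = min(best_loss, epoch_loss)
    save_checkpoint(epoch, model, optimizer, model_path, dataset, use_cuda, emb_path, is_best)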
Example no. 2
def save_checkpoint(model, optimizer, model_path, dataset, use_cuda, emb_path):
    write_embedding(dataset.vocab.itos, model.criterion.W, use_cuda, emb_path)
    torch.save(model.state_dict(), model_path)
    torch.save(optimizer.state_dict(), model_path + '_optim')
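write_embedding itself is a project-local helper and is not shown in these examples; a minimal compatible sketch, assuming it dumps the output-embedding matrix W in word2vec text format (one word per line), might look like this.

def write_embedding(itos, W, use_cuda, path):
    # assumption: W is the (vocab_size x dim) output-embedding matrix of the loss criterion
    weights = W.detach().cpu() if use_cuda else W.detach()
    with open(path, mode='w', encoding='utf-8') as f:
        f.write('{} {}\n'.format(len(itos), weights.size(1)))  # word2vec-style header: vocab size, dimension
        for word, vector in zip(itos, weights):
            f.write(word + ' ' + ' '.join('{:.6f}'.format(v) for v in vector.tolist()) + '\n')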
Example no. 3
import os
import time

import numpy as np
import torch
from torch import optim

# Dataset, Context2vec, write_config and write_embedding are project-local helpers,
# assumed to be imported from the surrounding package.
def train(train_path: str,
          emb_path: str,
          model_path: str,
          use_mlp=True,
          batch_size=64,
          epochs=3,
          maxlen=64,
          word_embed_size=200,
          hidden_size=200,
          learning_rate=0.0001,
          n_layers=1,
          min_freq=1,
          dropout=0.0,
          gpu_id=0):
    use_cuda = torch.cuda.is_available() and gpu_id > -1
    if use_cuda:
        device = torch.device('cuda:{}'.format(gpu_id))
        torch.cuda.set_device(gpu_id)
    else:
        device = torch.device('cpu')

    if not os.path.isfile(train_path):
        raise FileNotFoundError(train_path)

    # make output dir
    output_dir = os.path.dirname(emb_path)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print('Loading input file')
    # first pass: count the non-empty sentences so the container can be pre-allocated
    counter = 0
    with open(train_path, mode='r', encoding='utf-8') as f:
        for line in f:
            sentence = line.strip().lower().split()
            if sentence:
                counter += 1

    # second pass: store each non-empty sentence, truncated to maxlen tokens, as an object array
    sentences = np.empty(counter, dtype=object)
    counter = 0
    with open(train_path, mode='r', encoding='utf-8') as f:
        for line in f:
            sentence = line.strip().lower().split()
            if sentence:
                sentences[counter] = np.array(sentence[:maxlen])
                counter += 1

    print('Creating dataset, data size:', counter)
    dataset = Dataset(sentences, batch_size, min_freq, device)
    # per-word frequency array aligned with the vocabulary's index order; unseen entries get 0
    counter = np.array([dataset.vocab.freqs.get(word, 0) for word in dataset.vocab.itos])
    model = Context2vec(vocab_size=len(dataset.vocab),
                        counter=counter,
                        word_embed_size=word_embed_size,
                        hidden_size=hidden_size,
                        n_layers=n_layers,
                        bidirectional=True,
                        use_mlp=use_mlp,
                        dropout=dropout,
                        pad_index=dataset.pad_index,
                        device=device,
                        inference=False).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    print('batch_size:', batch_size, 'epochs:', epochs, 'word_embed_size:', word_embed_size, 'hidden_size:',
          hidden_size, 'device:', device)
    print('model:', model)

    # save model config
    output_config_file = model_path + '.config.json'
    write_config(output_config_file,
                 vocab_size=len(dataset.vocab),
                 word_embed_size=word_embed_size,
                 hidden_size=hidden_size,
                 n_layers=n_layers,
                 bidirectional=True,
                 use_mlp=use_mlp,
                 dropout=dropout,
                 pad_index=dataset.pad_index,
                 unk_token=dataset.unk_token,
                 bos_token=dataset.bos_token,
                 eos_token=dataset.eos_token,
                 learning_rate=learning_rate)

    interval = 1e6  # print a progress line roughly every 1e6 target words
    for epoch in range(epochs):
        begin_time = time.time()
        cur_at = begin_time
        total_loss = 0.0
        word_count = 0
        next_count = interval
        last_accum_loss = 0.0
        last_word_count = 0
        for iterator in dataset.get_batch_iter(batch_size):
            for batch in iterator:
                sentence = getattr(batch, 'sentence')
                # strip the first and last positions (BOS/EOS) so the loss covers only the inner tokens
                target = sentence[:, 1:-1]
                if target.size(0) == 0:
                    continue
                optimizer.zero_grad()
                loss = model(sentence, target)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

                minibatch_size, sentence_length = target.size()
                word_count += minibatch_size * sentence_length
                accum_mean_loss = float(total_loss) / word_count if total_loss > 0.0 else 0.0
                if word_count >= next_count:
                    now = time.time()
                    duration = now - cur_at
                    throughput = float(word_count - last_word_count) / duration
                    cur_mean_loss = (float(total_loss) - last_accum_loss) / (word_count - last_word_count)
                    print('{} words, {:.2f} sec, {:.2f} words/sec, {:.4f} accum_loss/word, {:.4f} cur_loss/word'
                          .format(word_count, duration, throughput, accum_mean_loss, cur_mean_loss))
                    next_count += interval
                    cur_at = now
                    last_accum_loss = float(total_loss)
                    last_word_count = word_count

        print('epoch:[{}/{}], total_loss:[{}]'.format(epoch + 1, epochs, total_loss))
        write_embedding(dataset.vocab.itos, model.criterion.W, use_cuda, emb_path + '.epoch_' + str(epoch + 1))
        torch.save(model.state_dict(), model_path + '.epoch_' + str(epoch + 1))
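Finally, a minimal invocation sketch for the train function above; the paths are placeholders, and gpu_id=-1 simply forces CPU execution per the use_cuda check at the top of the function.

if __name__ == '__main__':
    train(train_path='data/corpus.txt',        # placeholder: one sentence per line
          emb_path='out/context2vec.vec',      # placeholder output path for the embeddings
          model_path='out/context2vec.model',  # placeholder output path for model, optimizer and config
          epochs=3,
          gpu_id=-1)  # -1 keeps everything on CPU even when CUDA is available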