Example #1
def load_multi30k(config, device):
    csv_dir_path = get_or_create_dir('.data', 'multi30k')
    if not os.path.exists(f'{csv_dir_path}/train.csv'):
        # Download the Multi30k German-English corpus via torchtext on first
        # use, then convert the raw splits into CSV files.
        source_field = torchtext.data.Field(tokenize=tokenize_de)
        target_field = torchtext.data.Field(tokenize=tokenize_en)
        torchtext.datasets.Multi30k.splits(exts=('.de', '.en'), fields=(source_field, target_field))
        create_multi30k()
    return load_from_csv(config, csv_dir_path, tokenize_de, tokenize_en, device)
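
The tokenizers tokenize_de and tokenize_en are defined elsewhere in the repository. A minimal sketch of what they could look like, assuming the usual spaCy-based setup (the model names are an assumption):

import spacy

# Assumed implementation: spaCy tokenizers for German and English.
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

def tokenize_de(text):
    # Split a German sentence into a list of token strings.
    return [token.text for token in spacy_de.tokenizer(text)]

def tokenize_en(text):
    # Split an English sentence into a list of token strings.
    return [token.text for token in spacy_en.tokenizer(text)]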
Example #2
def train(config, sample_validation_batches):
    # Vocabularies, special tokens, and data iterators from the shared config.
    source_language = config.get('src_language')
    target_language = config.get('trg_language')
    EOS_token = config.get('EOS_token')
    PAD_token = config.get('PAD_token')
    SOS_token = config.get('SOS_token')
    train_iter = config.get('train_iter')
    val_iter = config.get('val_iter')
    # Separate TensorBoard writers keep training and validation curves apart.
    writer_path = config.get('writer_path')
    writer_train_path = get_or_create_dir(writer_path, 'train')
    writer_val_path = get_or_create_dir(writer_path, 'val')
    writer_train = SummaryWriter(log_dir=writer_train_path)
    writer_val = SummaryWriter(log_dir=writer_val_path)
    epochs = config.get('epochs')
    training = config.get('training')
    eval_every = training.get('eval_every')
    sample_every = training.get('sample_every')
    use_attention = config.get('use_attention')
    step = 1
    for epoch in range(epochs):
        print(f'Epoch: {epoch+1}/{epochs}')
        # Checkpoint at the start of each epoch; a final save follows the loop.
        save_weights(config)
        for i, training_batch in enumerate(train_iter):
            # One optimization step on the current batch; log the training loss.
            loss = train_batch(config, training_batch)
            writer_train.add_scalar('loss', loss, step)

            # Evaluate on the full validation set: average the loss and collect
            # tokenized references and hypotheses for a corpus-level BLEU score.
            if step == 1 or step % eval_every == 0:
                val_batches = 0
                val_losses = 0
                reference_corpus = []
                translation_corpus = []
                for val_batch in val_iter:
                    val_loss, translations = evaluate_batch(config, val_batch)
                    val_batches += 1
                    val_losses += val_loss
                    val_batch_trg, _ = val_batch.trg
                    _, batch_size = val_batch_trg.shape
                    # Convert each target column back to words and strip the
                    # special tokens; BLEU expects a list of references per sentence.
                    references = map(
                        lambda i: torch2words(target_language, val_batch_trg[:, i]),
                        range(batch_size))
                    references = map(
                        lambda words: [list(filter_words(words, SOS_token, EOS_token, PAD_token))],
                        references)
                    reference_corpus.extend(references)
                    # Apply the same conversion to the model's translations.
                    translations = map(
                        lambda translation: list2words(target_language, translation),
                        translations)
                    translations = map(
                        lambda words: list(filter_words(words, SOS_token, EOS_token, PAD_token)),
                        translations)
                    translation_corpus.extend(translations)
                bleu = compute_bleu(reference_corpus, translation_corpus)
                val_loss = val_losses / val_batches
                writer_val.add_scalar('bleu', bleu, step)
                writer_val.add_scalar('loss', val_loss, step)

            # Log a sample translation (and, when attention is enabled, an
            # attention heatmap) for a single validation sentence.
            if step % sample_every == 0:
                val_batch = sample_validation_batches(1)
                val_batch_src, val_lengths_src = val_batch.src
                val_batch_trg, _ = val_batch.trg
                source_length = val_lengths_src[0].item()
                _, translations, attention_weights = evaluate_batch(config, val_batch, True)
                source_words = torch2words(source_language, val_batch_src[:, 0])
                target_words = torch2words(target_language, val_batch_trg[:, 0])
                translation_words = list(filter(
                    lambda word: word != PAD_token,
                    list2words(target_language, translations[0])))
                if use_attention and sum(attention_weights.shape) != 0:
                    attention_figure = visualize_attention(
                        source_words[:source_length], translation_words,
                        with_cpu(attention_weights))
                    writer_val.add_figure('attention', attention_figure, step)
                text = get_text(source_words, target_words, translation_words,
                                SOS_token, EOS_token, PAD_token)
                writer_val.add_text('translation', text, step)

            step += 1

    save_weights(config)
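
The evaluation loop above leans on several helpers that are not part of the example. The sketches below are assumptions inferred from the call sites: compute_bleu receives one list of reference token lists per sentence and one hypothesis token list per sentence, which matches the shape NLTK's corpus_bleu expects; filter_words strips special tokens; torch2words maps a tensor of vocabulary indices back to words.

from nltk.translate.bleu_score import corpus_bleu

def compute_bleu(reference_corpus, translation_corpus):
    # reference_corpus: one list of reference token lists per sentence.
    # translation_corpus: one hypothesis token list per sentence.
    return corpus_bleu(reference_corpus, translation_corpus)

def filter_words(words, *special_tokens):
    # Drop SOS/EOS/PAD markers before scoring or printing samples.
    return filter(lambda word: word not in special_tokens, words)

def torch2words(language, tensor):
    # Map a 1-D tensor of vocabulary indices to token strings, assuming the
    # language object is a legacy torchtext Field with a built vocabulary.
    return [language.vocab.itos[index] for index in tensor]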
Example #3
def load_dummy_variable_length(config, device):
    csv_dir_path = get_or_create_dir('.data', 'dummy_variable_length')
    if not os.path.exists(f'{csv_dir_path}/train.csv'):
        # Generate the synthetic variable-length corpus on first use.
        create_dummy_variable_length_csv()
    return load_from_csv(config, csv_dir_path, tokenize_dummy, tokenize_dummy, device)
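
tokenize_dummy is not shown; since the dummy corpus is synthetic, plain whitespace splitting is presumably sufficient. A sketch under that assumption:

def tokenize_dummy(text):
    # Synthetic data needs no linguistic tokenizer; split on whitespace.
    return text.split()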
Example #4
def load_debug(config, device):
    csv_dir_path = get_or_create_dir('.data', 'debug')
    if not os.path.exists(f'{csv_dir_path}/train.csv'):
        # Create a tiny German-English debug set on first use.
        create_debug_csv()
    return load_from_csv(config, csv_dir_path, tokenize_de, tokenize_en, device)
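
All three loaders delegate to a shared load_from_csv helper whose implementation is not part of these examples. The sketch below is an assumption based on the legacy torchtext (pre-0.9) API used above and on the training loop unpacking batch.src and batch.trg into (tensor, lengths) pairs, which implies include_lengths=True; the batch_size config key and the return value are likewise assumptions.

def load_from_csv(config, csv_dir_path, tokenize_src, tokenize_trg, device):
    # Fields mirror how the training loop consumes batches: include_lengths=True
    # makes batch.src and batch.trg unpack into (token_tensor, lengths).
    src_field = torchtext.data.Field(tokenize=tokenize_src, init_token=config.get('SOS_token'),
                                     eos_token=config.get('EOS_token'), pad_token=config.get('PAD_token'),
                                     include_lengths=True)
    trg_field = torchtext.data.Field(tokenize=tokenize_trg, init_token=config.get('SOS_token'),
                                     eos_token=config.get('EOS_token'), pad_token=config.get('PAD_token'),
                                     include_lengths=True)
    train_data, val_data = torchtext.data.TabularDataset.splits(
        path=csv_dir_path, train='train.csv', validation='val.csv', format='csv',
        fields=[('src', src_field), ('trg', trg_field)])
    src_field.build_vocab(train_data)
    trg_field.build_vocab(train_data)
    train_iter, val_iter = torchtext.data.BucketIterator.splits(
        (train_data, val_data), batch_size=config.get('batch_size'), device=device,
        sort_key=lambda example: len(example.src), sort_within_batch=True)
    return train_iter, val_iter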