Example #1
def make_test_data(config):
    path = parse_path(config['data_process']['base_path'])
    test_dataset = Seq2SeqDataset(path['processed']['test'])
    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=32,
        shuffle=False,
        num_workers=2
    )
    return test_loader
Example #2
def make_train_data(config):
    path = parse_path(config['data_process']['base_path'])
    train_dataset = Seq2SeqDataset(path['processed']['train'])
    val_dataset = Seq2SeqDataset(path['processed']['val'])
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=config['model'][config['model']['type']]['batch_size'],
        shuffle=True,
        num_workers=2
    )
    val_loader = DataLoader(
        dataset=val_dataset,
        batch_size=32,
        shuffle=False,
        num_workers=2
    )
    return train_loader, val_loader
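For orientation, here is a minimal sketch of the nested config dict these loaders assume; the keys are inferred from the accessors above, while the concrete values (and the 'transformer' type name) are purely illustrative rather than the project's actual defaults.

config = {
    'data_process': {'base_path': 'data/'},
    'model': {
        'type': 'transformer',              # selects which per-model block applies
        'share_src_trg_vocab': True,
        'transformer': {'batch_size': 64}   # batch_size is read from the selected block
    }
}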
Example #3
def train(config):
    path = parse_path(config['data_process']['base_path'])
    model = make_model(config).cuda()
    train_loader, val_loader = make_train_data(config)
    if config['model']['share_src_trg_vocab']:
        with open(path['processed']['index2word'], 'rb') as handle:
            trg_index2word = pickle.load(handle)
    else:
        with open(path['processed']['trg_index2word'], 'rb') as handle:
            trg_index2word = pickle.load(handle)
    # Narrow config down to the hyper-parameter block of the selected model type.
    config = config['model'][config['model']['type']]
    criterion = SentenceCrossEntropy(label_smoothing=config['label_smoothing'])
    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])
    for epoch in range(1, config['num_epoches'] + 1):
        sum_loss = 0
        sum_examples = 0
        s_loss = 0
        for i, data in enumerate(train_loader):
            src, trg = data
            src, trg = src.cuda(), trg.cuda()
            optimizer.zero_grad()
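            # Teacher forcing: the decoder receives trg[:, :-1] as input and the
            # loss compares its logits against the next tokens, trg[:, 1:].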
            logits = model(src, trg[:, 0:-1])
            loss = criterion(logits, trg[:, 1:])
            sum_loss += loss.item() * src.size(0)
            sum_examples += src.size(0)
            s_loss += loss.item()
            if i > 0 and i % 100 == 0:
                s_loss /= 100
                print('[epoch %2d] [step %4d] [loss %.4f]' %
                      (epoch, i, s_loss))
                s_loss = 0
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), config['clip'])
            optimizer.step()
        avg_loss = sum_loss / sum_examples
        val_loss = eval(val_loader, model, config['max_len'], criterion,
                        trg_index2word)
        print('[epoch %2d] [train loss %.4f] [val loss %.4f]' %
              (epoch, avg_loss, val_loss))
Example #4
def make_recurrent_seq2seq(config):
    path = parse_path(config['data_process']['base_path'])
    with open(path['log']['data_log']) as handle:
        data_log = yaml.safe_load(handle)
    share_src_trg_vocab = config['model']['share_src_trg_vocab']
    config = config['model'][config['model']['type']]
    if share_src_trg_vocab:
        src_embedding = nn.Embedding(
            num_embeddings=data_log['vocab_size'],
            embedding_dim=config['embed_size']
        )
        trg_embedding = src_embedding
    else:
        src_embedding = nn.Embedding(
            num_embeddings=data_log['src_vocab_size'],
            embedding_dim=config['embed_size']
        )
        trg_embedding = nn.Embedding(
            num_embeddings=data_log['trg_vocab_size'],
            embedding_dim=config['embed_size']
        )
    encoder = RecurrentEncoder(
        rnn_type=config['rnn_type'],
        embedding=src_embedding,
        hidden_size=config['hidden_size'],
        num_layers=config['num_layers'],
        bidirectional=config['bidirectional'],
        dropout=config['dropout']
    )
    decoder = RecurrentDecoder(
        rnn_type=config['rnn_type'],
        embedding=trg_embedding,
        hidden_size=config['hidden_size'],
        num_layers=config['num_layers'],
        dropout=config['dropout'],
        share_decoder_embedding=config['share_decoder_embedding']
    )
    model = RecurrentSeq2Seq(encoder, decoder)
    return model
def data_process(config):
    path = parse_path(config['data_process']['base_path'])
    src_train_word_lists = get_word_lists(
        path['raw']['src_train'], config['data_process']['src_clip_len'])
    trg_train_word_lists = get_word_lists(
        path['raw']['trg_train'], config['data_process']['trg_clip_len'])
    src_val_word_lists = get_word_lists(path['raw']['src_val'],
                                        config['data_process']['src_clip_len'])
    trg_val_word_lists = get_word_lists(path['raw']['trg_val'],
                                        config['data_process']['trg_clip_len'])
    src_test_word_lists = get_word_lists(
        path['raw']['src_test'], config['data_process']['src_clip_len'])
    trg_test_word_lists = get_word_lists(
        path['raw']['trg_test'], config['data_process']['trg_clip_len'])

    if config['model']['share_src_trg_vocab']:
        vocab = Vocab()

        for word_list in src_train_word_lists:
            vocab.add_list(word_list)
        for word_list in trg_train_word_lists:
            vocab.add_list(word_list)

        word2index, index2word = vocab.get_vocab(
            max_size=config['data_process']['vocab']['max_size'],
            min_freq=config['data_process']['vocab']['min_freq'])
        src_train = word_lists2numpy(src_train_word_lists, word2index)
        trg_train = word_lists2numpy(trg_train_word_lists, word2index)
        src_val = word_lists2numpy(src_val_word_lists, word2index)
        trg_val = word_lists2numpy(trg_val_word_lists, word2index)
        src_test = word_lists2numpy(src_test_word_lists, word2index)
        trg_test = word_lists2numpy(trg_test_word_lists, word2index)

        if not os.path.exists(os.path.dirname(path['processed']['train'])):
            os.makedirs(os.path.dirname(path['processed']['train']))

        np.savez(path['processed']['train'], src=src_train, trg=trg_train)
        np.savez(path['processed']['val'], src=src_val, trg=trg_val)
        np.savez(path['processed']['test'], src=src_test, trg=trg_test)

        with open(path['processed']['word2index'], 'wb') as handle:
            pickle.dump(word2index, handle)
        with open(path['processed']['index2word'], 'wb') as handle:
            pickle.dump(index2word, handle)
        data_log = {
            'vocab_size': len(index2word),
            'oov_size': len(word2index) - len(index2word),
            'src_train': analyze(src_train_word_lists),
            'trg_train': analyze(trg_train_word_lists),
            'src_val': analyze(src_val_word_lists),
            'trg_val': analyze(trg_val_word_lists),
            'src_test': analyze(src_test_word_lists),
            'trg_test': analyze(trg_test_word_lists)
        }
        if not os.path.exists(os.path.dirname(path['log']['data_log'])):
            os.makedirs(os.path.dirname(path['log']['data_log']))
        with open(path['log']['data_log'], 'wb') as handle:
            yaml.safe_dump(data_log,
                           handle,
                           encoding='utf-8',
                           allow_unicode=True,
                           default_flow_style=False)
    else:
        src_vocab = Vocab()
        trg_vocab = Vocab()

        for word_list in src_train_word_lists:
            src_vocab.add_list(word_list)
        for word_list in trg_train_word_lists:
            trg_vocab.add_list(word_list)

        src_word2index, src_index2word = src_vocab.get_vocab(
            max_size=config['data_process']['vocab']['src']['max_size'],
            min_freq=config['data_process']['vocab']['src']['min_freq'])
        trg_word2index, trg_index2word = trg_vocab.get_vocab(
            max_size=config['data_process']['vocab']['trg']['max_size'],
            min_freq=config['data_process']['vocab']['trg']['min_freq'])

        src_train = word_lists2numpy(src_train_word_lists, src_word2index)
        trg_train = word_lists2numpy(trg_train_word_lists, trg_word2index)
        src_val = word_lists2numpy(src_val_word_lists, src_word2index)
        trg_val = word_lists2numpy(trg_val_word_lists, trg_word2index)
        src_test = word_lists2numpy(src_test_word_lists, src_word2index)
        trg_test = word_lists2numpy(trg_test_word_lists, trg_word2index)

        if not os.path.exists(os.path.dirname(path['processed']['train'])):
            os.makedirs(os.path.dirname(path['processed']['train']))

        np.savez(path['processed']['train'], src=src_train, trg=trg_train)
        np.savez(path['processed']['val'], src=src_val, trg=trg_val)
        np.savez(path['processed']['test'], src=src_test, trg=trg_test)

        with open(path['processed']['src_word2index'], 'wb') as handle:
            pickle.dump(src_word2index, handle)
        with open(path['processed']['src_index2word'], 'wb') as handle:
            pickle.dump(src_index2word, handle)
        with open(path['processed']['trg_word2index'], 'wb') as handle:
            pickle.dump(trg_word2index, handle)
        with open(path['processed']['trg_index2word'], 'wb') as handle:
            pickle.dump(trg_index2word, handle)
        data_log = {
            'src_vocab_size': len(src_index2word),
            'src_oov_size': len(src_word2index) - len(src_index2word),
            'trg_vocab_size': len(trg_index2word),
            'trg_oov_size': len(trg_word2index) - len(trg_index2word),
            'src_train': analyze(src_train_word_lists),
            'trg_train': analyze(trg_train_word_lists),
            'src_val': analyze(src_val_word_lists),
            'trg_val': analyze(trg_val_word_lists),
            'src_test': analyze(src_test_word_lists),
            'trg_test': analyze(trg_test_word_lists)
        }
        if not os.path.exists(os.path.dirname(path['log']['data_log'])):
            os.makedirs(os.path.dirname(path['log']['data_log']))
        with open(path['log']['data_log'], 'wb') as handle:
            yaml.safe_dump(data_log,
                           handle,
                           encoding='utf-8',
                           allow_unicode=True,
                           default_flow_style=False)
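A hedged end-to-end sketch of how these functions could be chained: run data_process once to write the processed arrays and vocab pickles, then call train, which reads them back through make_train_data. The config file name is an assumption for illustration.

import yaml

if __name__ == '__main__':
    with open('config.yml') as handle:  # hypothetical config path
        config = yaml.safe_load(handle)
    data_process(config)   # writes processed train/val/test arrays and vocab pickles
    train(config)          # builds the model and loaders, then runs the training loop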
Example #6
def make_transformer(config):
    path = parse_path(config['data_process']['base_path'])
    with open(path['log']['data_log']) as handle:
        data_log = yaml.safe_load(handle)
    share_src_trg_vocab = config['model']['share_src_trg_vocab']
    config = config['model'][config['model']['type']]
    if share_src_trg_vocab:
        src_embedding = nn.Embedding(
            num_embeddings=data_log['vocab_size'],
            embedding_dim=config['d_model']
        )
        trg_embedding = src_embedding
    else:
        src_embedding = nn.Embedding(
            num_embeddings=data_log['src_vocab_size'],
            embedding_dim=config['d_model']
        )
        trg_embedding = nn.Embedding(
            num_embeddings=data_log['trg_vocab_size'],
            embedding_dim=config['d_model']
        )
    positional_embedding = PositionalEmbedding(
        num_embeddings=config['num_positions'],
        embedding_dim=config['d_model'],
        learnable=False
    )
    scaled_dot_attention = ScaledDotAttention(dropout=0)
    multi_head_attention = MultiHeadAttention(
        attention=scaled_dot_attention,
        num_heads=config['num_heads'],
        hidden_size=config['d_model'],
        key_size=config['d_model'] // config['num_heads'],
        value_size=config['d_model'] // config['num_heads']
    )
    feed_forward = FeedForward(
        input_size=config['d_model'],
        feed_forward_size=4 * config['d_model'],
        output_size=config['d_model']
    )
    encoder_layer = TransformerEncoderLayer(
        hidden_size=config['d_model'],
        attention=deepcopy(multi_head_attention),
        feed_forward=deepcopy(feed_forward),
        dropout=config['dropout']
    )
    encoder = TransformerEncoder(
        embedding=src_embedding,
        positional_embedding=positional_embedding,
        layer=encoder_layer,
        num_layers=config['num_layers'],
        dropout=config['dropout']
    )
    decoder_layer = TransformerDecoderLayer(
        hidden_size=config['d_model'],
        self_attention=deepcopy(multi_head_attention),
        src_attention=deepcopy(multi_head_attention),
        feed_forward=deepcopy(feed_forward),
        dropout=config['dropout']
    )
    decoder = TransformerDecoder(
        embedding=trg_embedding,
        positional_embedding=positional_embedding,
        layer=decoder_layer,
        num_layers=config['num_layers'],
        dropout=config['dropout']
    )
    transformer = Transformer(
        encoder=encoder,
        decoder=decoder
    )
    return transformer
def make_conv_seq2seq(config):
    path = parse_path(config['data_process']['base_path'])
    with open(path['log']['data_log']) as handle:
        data_log = yaml.safe_load(handle)
    share_src_trg_vocab = config['model']['share_src_trg_vocab']
    config = config['model'][config['model']['type']]
    if share_src_trg_vocab:
        src_embedding = nn.Embedding(
            num_embeddings=data_log['vocab_size'],
            embedding_dim=config['embed_size']
        )
        trg_embedding = src_embedding
    else:
        src_embedding = nn.Embedding(
            num_embeddings=data_log['src_vocab_size'],
            embedding_dim=config['embed_size']
        )
        trg_embedding = nn.Embedding(
            num_embeddings=data_log['trg_vocab_size'],
            embedding_dim=config['embed_size']
        )
    positional_embedding = PositionalEmbedding(
        num_embeddings=config['num_positions'],
        embedding_dim=config['embed_size']
    )
    if config['activate'] == 'glu':
        encoder_conv = ConvGLU(
            input_size=config['hidden_size'],
            output_size=config['hidden_size'],
            kernel_size=config['kernel_size'],
            encode=True
        )
        decoder_conv = ConvGLU(
            input_size=config['hidden_size'],
            output_size=config['hidden_size'],
            kernel_size=config['kernel_size'],
            encode=False
        )
    elif config['activate'] == 'relu':
        encoder_conv = ConvReLU(
            input_size=config['hidden_size'],
            output_size=config['hidden_size'],
            kernel_size=config['kernel_size'],
            encode=True
        )
        decoder_conv = ConvReLU(
            input_size=config['hidden_size'],
            output_size=config['hidden_size'],
            kernel_size=config['kernel_size'],
            encode=False
        )
    else:
        raise ValueError("unsupported activation '%s' (expected 'glu' or 'relu')"
                         % config['activate'])
    feed_forward = FeedForward(
        input_size=config['hidden_size'],
        feed_forward_size=4 * config['hidden_size'],
        output_size=config['hidden_size']
    )
    conv_encoder_layer = ConvEncoderLayer(
        hidden_size=config['hidden_size'],
        conv=encoder_conv,
        feed_forward=deepcopy(feed_forward),
        dropout=config['dropout']
    )
    conv_encoder = ConvEncoder(
        embedding=src_embedding,
        positional_embedding=positional_embedding,
        layer=conv_encoder_layer,
        num_layers=config['num_layers'],
        dropout=config['dropout']
    )
    dot_attention = DotAttention(dropout=0)
    conv_decoder_layer = ConvDecoderLayer(
        hidden_size=config['hidden_size'],
        embed_size=config['embed_size'],
        conv=decoder_conv,
        attention=dot_attention,
        feed_forward=feed_forward,
        dropout=config['dropout']
    )
    conv_decoder = ConvDecoder(
        embedding=trg_embedding,
        positional_embedding=positional_embedding,
        layer=conv_decoder_layer,
        num_layers=config['num_layers'],
        dropout=config['dropout']
    )
    conv_seq2seq = ConvSeq2Seq(
        encoder=conv_encoder,
        decoder=conv_decoder
    )
    return conv_seq2seq
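Example #3 calls a make_model factory that is not shown on this page. A plausible sketch, assuming it simply dispatches on config['model']['type'] to the constructors above; the concrete type strings are assumptions.

def make_model(config):
    # Hypothetical dispatcher; the real project may use different type names.
    model_type = config['model']['type']
    if model_type == 'transformer':
        return make_transformer(config)
    if model_type == 'conv_seq2seq':
        return make_conv_seq2seq(config)
    if model_type == 'recurrent_seq2seq':
        return make_recurrent_seq2seq(config)
    raise ValueError('unknown model type: %s' % model_type)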