Example #1
    def setUp(self):
        test_path = os.path.dirname(os.path.realpath(__file__))
        src = SourceField()
        tgt = TargetField()
        self.dataset = torchtext.data.TabularDataset(
            path=os.path.join(test_path, 'data/eng-fra.txt'), format='tsv',
            fields=[('src', src), ('tgt', tgt)],
        )
        src.build_vocab(self.dataset)
        tgt.build_vocab(self.dataset)
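A dataset built this way is usually consumed through a legacy torchtext BucketIterator; a minimal sketch, mirroring the iterator usage in Example #23 near the end of this listing:

# Sketch (not part of the original test): batch a dataset built as above.
# Because SourceField keeps include_lengths=True, the 'src' batch attribute
# unpacks into a (padded tensor, lengths) pair, as Example #23 also shows.
def iterate_once(dataset):
    batch_iterator = torchtext.data.BucketIterator(
        dataset=dataset, batch_size=4,
        sort_key=lambda x: -len(x.src), repeat=False)
    for batch in batch_iterator:
        input_variables, input_lengths = getattr(batch, 'src')
        target_variables = getattr(batch, 'tgt')
        return input_variables, input_lengths, target_variables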
Example #2
def make_datasets(train_df, dev_df):
    src = SourceField(tokenize=list)
    tgt = TargetField(tokenize=list)
    train = _prepare_dataset(
        train_df,
        (src, tgt)
    )
    dev = _prepare_dataset(
        dev_df,
        (src, tgt)
    )
    src.build_vocab(train)
    tgt.build_vocab(train)
    return train, dev, src, tgt
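`_prepare_dataset` is defined outside this excerpt; a hypothetical stand-in, assuming the DataFrames expose 'src' and 'tgt' columns and using the legacy torchtext Example/Dataset API, could look like:

# Hypothetical helper (not part of the original snippet): build a torchtext
# Dataset from a pandas DataFrame with 'src' and 'tgt' columns.
def _prepare_dataset(df, fields):
    src, tgt = fields
    field_list = [('src', src), ('tgt', tgt)]
    examples = [
        torchtext.data.Example.fromlist([row.src, row.tgt], field_list)
        for row in df.itertuples(index=False)
    ]
    return torchtext.data.Dataset(examples, field_list)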
Example #3
    @classmethod
    def setUpClass(self):
        test_path = os.path.dirname(os.path.realpath(__file__))
        src = SourceField()
        trg = TargetField()
        dataset = torchtext.data.TabularDataset(
            path=os.path.join(test_path, 'data/eng-fra.txt'), format='tsv',
            fields=[('src', src), ('trg', trg)],
        )
        src.build_vocab(dataset)
        trg.build_vocab(dataset)

        encoder = EncoderRNN(len(src.vocab), 10, 10, rnn_cell='lstm')
        decoder = DecoderRNN(len(trg.vocab), 10, 10, trg.sos_id, trg.eos_id, rnn_cell='lstm')
        seq2seq = Seq2seq(encoder, decoder)
        self.predictor = Predictor(seq2seq, src.vocab, trg.vocab)
    def setUp(self):
        test_path = os.path.dirname(os.path.realpath(__file__))
        src = SourceField()
        tgt = TargetField()
        self.dataset = torchtext.data.TabularDataset(
            path=os.path.join(test_path, 'data/eng-fra.txt'), format='tsv',
            fields=[('src', src), ('tgt', tgt)],
        )
        src.build_vocab(self.dataset)
        tgt.build_vocab(self.dataset)

        encoder = EncoderRNN(len(src.vocab), 10, 10, rnn_cell='lstm')
        decoder = DecoderRNN(len(tgt.vocab), 10, 10, tgt.sos_id, tgt.eos_id, rnn_cell='lstm')
        self.seq2seq = Seq2seq(encoder, decoder)

        for param in self.seq2seq.parameters():
            param.data.uniform_(-0.08, 0.08)
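A test method built on these fixtures would typically wrap the model in a Predictor and translate a token list; a minimal sketch, assuming Predictor is imported as in the setUpClass example above (the random weights make the output arbitrary):

    def test_predict_runs(self):
        # Sketch only: wrap the randomly initialised model in a Predictor and
        # check that prediction returns a token list.
        src_vocab = self.dataset.fields['src'].vocab
        tgt_vocab = self.dataset.fields['tgt'].vocab
        predictor = Predictor(self.seq2seq, src_vocab, tgt_vocab)
        self.assertTrue(isinstance(predictor.predict(['I', 'am', 'fat']), list))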
Example #5
def gen_data(train_path, dev_path):
    # Prepare dataset
    src = SourceField()
    tgt = TargetField()
    train = torchtext.data.TabularDataset(path=train_path,
                                          format='tsv',
                                          fields=[('src', src), ('tgt', tgt)],
                                          filter_pred=len_filter)
    dev = torchtext.data.TabularDataset(path=dev_path,
                                        format='tsv',
                                        fields=[('src', src), ('tgt', tgt)],
                                        filter_pred=len_filter)

    src.build_vocab(train, max_size=50000)
    tgt.build_vocab(train, max_size=50000)
    input_vocab = src.vocab
    output_vocab = tgt.vocab
    return train, dev, input_vocab, output_vocab
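gen_data relies on a len_filter predicate defined elsewhere in that module; judging from the other examples in this listing, it is the usual length cut-off, for instance:

# Assumed shape of the len_filter referenced by gen_data() above; the
# cut-off value here is illustrative.
max_len = 50

def len_filter(example):
    return len(example.src) <= max_len and len(example.tgt) <= max_len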
Example #6
    src = SourceField(init_token='<sos>', eos_token='<eos>')
    tgt = TargetField(
        init_token='<sos>',
        eos_token='<eos>')  # init_token='<sos>', eos_token='<eos>'
    train_data = torchtext.data.TabularDataset(
        # path='data/diffs/test/val.small.del_add.data', format='tsv',
        # path='data/splitted_two_input_100/train_100_10.data', format='tsv',
        path='data/splitted_two_input_100/train_100_47500.data',
        format='tsv',
        # path='data/splitted_two_input_200/train_200_83000.data', format='tsv',
        # path='../../../../new_data/processed_data/splitted_two_input_100/train_100.data', format='tsv',
        fields=[(src_del_field_name, src), (src_add_field_name, src),
                (tgt_field_name, tgt)],
    )

    src.build_vocab(train_data, max_size=50000)
    tgt.build_vocab(train_data, max_size=50000, min_freq=1)
    input_vocab = src.vocab
    output_vocab = tgt.vocab
    pad_index = output_vocab.stoi['<pad>']

    test_data = torchtext.data.TabularDataset(
        # path='data/diffs/test/val.small.del_add.data', format='tsv',
        path='data/splitted_two_input_100/test_100_19000.data',
        format='tsv',
        # path='data/splitted_two_input_200/test_200_17000.data', format='tsv',
        # path='data/splitted_two_input_100/test_100.data', format='tsv',
        # path='../../../../new_data/processed_data/splitted_two_input_100/train_100.data', format='tsv',
        fields=[(src_del_field_name, src), (src_add_field_name, src),
                (tgt_field_name, tgt)],
    )
    src.rebuild_vocab(input_vocab)
    tgt.rebuild_vocab(output_vocab)

    weight = torch.ones(len(tgt.vocab))
    pad = tgt.vocab.stoi[tgt.pad_token]
    loss = BLEULoss(weight, pad, tgt)
else:
    # Prepare dataset
    src = SourceField(sequential=True, use_vocab=True)
    tgt = TargetField(sequential=True, use_vocab=True)
    max_len = 23

    train = torchtext.data.TabularDataset(path=opt.train_path,
                                          format="tsv",
                                          fields=[("src", src), ("tgt", tgt)])
    src.build_vocab(train, vectors="glove.6B.100d", max_size=16384)
    tgt.build_vocab(train, vectors="glove.6B.100d", max_size=16384)
    input_vocab = src.vocab
    output_vocab = tgt.vocab

    # NOTE: If the source field name and the target field name
    # are different from 'src' and 'tgt' respectively, they have
    # to be set explicitly before any training or inference
    # seq2seq.src_field_name = 'src'
    # seq2seq.tgt_field_name = 'tgt'

    # Prepare loss
    weight = torch.ones(len(tgt.vocab))
    pad = tgt.vocab.stoi[tgt.pad_token]
    loss = BLEULoss(weight, pad, tgt)
    tgt = TargetField()
    max_len = 200

    def len_filter(example):
        return len(example.src) <= max_len and len(example.tgt) <= max_len

    train = torchtext.data.TabularDataset(path=opt.train_path,
                                          format='tsv',
                                          fields=[('src', src), ('tgt', tgt)],
                                          filter_pred=len_filter)
    dev = torchtext.data.TabularDataset(path=opt.dev_path,
                                        format='tsv',
                                        fields=[('src', src), ('tgt', tgt)],
                                        filter_pred=len_filter)

    vectors = torchtext.vocab.Vectors(
        name='../data/toy_reverse/train/falv.vector')

    src.build_vocab(train, max_size=200000, vectors=vectors)
    tgt.build_vocab(train, max_size=200000)

    input_vocab = src.vocab
    output_vocab = tgt.vocab

    # Embedding layer created with PyTorch
    embedding = nn.Embedding(2893, 400)
    # Use the pretrained vectors as the initial weights of the embedding matrix
    weight_matrix = src.vocab.vectors
    embedding.weight.data.copy_(weight_matrix)
    # To specify pretrained weights while keeping them trainable, set requires_grad=True:
    # embeddings.weight = nn.Parameter(embeddings, requires_grad=True)
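Rather than hard-coding the 2893 x 400 shape above, the sizes can be read from the vectors attached to the vocabulary; a small sketch (Example #18 below passes such a matrix straight to EncoderRNN via its embedding argument):

    # Sketch: derive the embedding shape from the vectors loaded into
    # src.vocab instead of hard-coding it, then copy the pretrained weights.
    vocab_size, embed_dim = src.vocab.vectors.size()
    embedding = nn.Embedding(vocab_size, embed_dim)
    embedding.weight.data.copy_(src.vocab.vectors)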
Example #9
    tgt = TargetField(stop_words=stopwords)
    max_len = 100

    def len_filter(example):
        return len(example.src) <= max_len and len(example.tgt) <= max_len
    train = torchtext.data.TabularDataset(
        path=opt.train_path, format='tsv',
        fields=[('src', src), ('tgt', tgt)],
        filter_pred=len_filter
    )
    dev = torchtext.data.TabularDataset(
        path=opt.dev_path, format='tsv',
        fields=[('src', src), ('tgt', tgt)],
        filter_pred=len_filter
    )
    src.build_vocab(train, max_size=50000, vectors=GloVe(name='6B', dim=300))
    tgt.build_vocab(train, max_size=50000, vectors=GloVe(name='6B', dim=300))
    input_vocab = src.vocab
    output_vocab = tgt.vocab
    print(len(train), len(dev))

    # NOTE: If the source field name and the target field name
    # are different from 'src' and 'tgt' respectively, they have
    # to be set explicitly before any training or inference
    # seq2seq.src_field_name = 'src'
    # seq2seq.tgt_field_name = 'tgt'

    # Prepare loss
    weight = torch.ones(len(tgt.vocab))
    pad = tgt.vocab.stoi[tgt.pad_token]
    loss = Perplexity(weight, pad)
Example #10
def train():
    src = SourceField(sequential=True, tokenize=jieba.lcut)
    tgt = TargetField(sequential=True, tokenize=jieba.lcut)
    max_len = 50

    def len_filter(example):
        return len(example.src) <= max_len and len(example.tgt) <= max_len

    train = torchtext.data.TabularDataset(path=opt.train_path,
                                          format='csv',
                                          fields=[('src', src), ('tgt', tgt)],
                                          filter_pred=len_filter)
    dev = torchtext.data.TabularDataset(path=opt.dev_path,
                                        format='csv',
                                        fields=[('src', src), ('tgt', tgt)],
                                        filter_pred=len_filter)

    src.build_vocab(train, max_size=50000)
    tgt.build_vocab(train, max_size=50000)
    input_vocab = src.vocab
    output_vocab = tgt.vocab

    # NOTE: If the source field name and the target field name
    # are different from 'src' and 'tgt' respectively, they have
    # to be set explicitly before any training or inference
    # seq2seq.src_field_name = 'src'
    # seq2seq.tgt_field_name = 'tgt'

    # Prepare loss
    weight = torch.ones(len(tgt.vocab))
    pad = tgt.vocab.stoi[tgt.pad_token]
    loss = Perplexity(weight, pad)
    if torch.cuda.is_available():
        loss.cuda()

    seq2seq = None
    optimizer = None
    if not opt.resume:
        # Initialize model
        hidden_size = 128
        bidirectional = True
        encoder = EncoderRNN(len(src.vocab),
                             max_len,
                             hidden_size,
                             bidirectional=bidirectional,
                             variable_lengths=True)
        decoder = DecoderRNN(len(tgt.vocab),
                             max_len,
                             hidden_size * 2 if bidirectional else hidden_size,
                             dropout_p=0.2,
                             use_attention=True,
                             bidirectional=bidirectional,
                             eos_id=tgt.eos_id,
                             sos_id=tgt.sos_id)
        seq2seq = Seq2seq(encoder, decoder)
        if torch.cuda.is_available():
            seq2seq.cuda()

        for param in seq2seq.parameters():
            param.data.uniform_(-0.08, 0.08)

        # Optimizer and learning rate scheduler can be customized by
        # explicitly constructing the objects and passing them to the trainer.
        #
        # optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
        # scheduler = StepLR(optimizer.optimizer, 1)
        # optimizer.set_scheduler(scheduler)

    else:
        logging.info("loading checkpoint from {}".format(
            os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME,
                         opt.load_checkpoint)))
        checkpoint_path = os.path.join(opt.expt_dir,
                                       Checkpoint.CHECKPOINT_DIR_NAME,
                                       opt.load_checkpoint)
        checkpoint = Checkpoint.load(checkpoint_path)
        seq2seq = checkpoint.model
        # input_vocab = checkpoint.input_vocab
        # output_vocab = checkpoint.output_vocab
        src.vocab = checkpoint.input_vocab
        tgt.vocab = checkpoint.output_vocab

    # train
    t = SupervisedTrainer(loss=loss,
                          batch_size=32,
                          checkpoint_every=50,
                          print_every=10,
                          expt_dir=opt.expt_dir)

    seq2seq = t.train(seq2seq,
                      train,
                      num_epochs=6,
                      dev_data=dev,
                      optimizer=optimizer,
                      teacher_forcing_ratio=0.5,
                      resume=opt.resume)
    predictor = Predictor(seq2seq, input_vocab, output_vocab)
else:
    src.build_vocab(train,
                    max_size=params['src_vocab_size'],
                    specials=replace_tokens)
    tgt.build_vocab(train, max_size=params['tgt_vocab_size'])
    # input_vocab = src.vocab
    # output_vocab = tgt.vocab

logging.info('Indices of special replace tokens:\n')
for rep in replace_tokens:
    logging.info("%s, %d; " % (rep, src.vocab.stoi[rep]))
logging.info('\n')

# Prepare loss
weight = torch.ones(len(tgt.vocab))
pad = tgt.vocab.stoi[tgt.pad_token]
loss = Perplexity(weight, pad)
if torch.cuda.is_available():
Example #12
    src = SourceField()
    tgt = TargetField()
    max_len = 20

    def len_filter(example):
        return len(example.src) <= max_len and len(example.tgt) <= max_len

    train = torchtext.data.TabularDataset(path=opt.train_path,
                                          format='tsv',
                                          fields=[('src', src), ('tgt', tgt)],
                                          filter_pred=len_filter)
    dev = torchtext.data.TabularDataset(path=opt.dev_path,
                                        format='tsv',
                                        fields=[('src', src), ('tgt', tgt)],
                                        filter_pred=len_filter)
    src.build_vocab(train, max_size=100000, min_freq=1)
    tgt.build_vocab(train, max_size=100000, min_freq=1)
    input_vocab = src.vocab
    output_vocab = tgt.vocab

    # NOTE: If the source field name and the target field name
    # are different from 'src' and 'tgt' respectively, they have
    # to be set explicitly before any training or inference
    # seq2seq.src_field_name = 'src'
    # seq2seq.tgt_field_name = 'tgt'

    # Prepare loss
    weight = torch.ones(len(tgt.vocab))
    pad = tgt.vocab.stoi[tgt.pad_token]
    loss = Perplexity(weight, pad)
    if torch.cuda.is_available():
Example #13
    src = SourceField()
    tgt = TargetField()
    max_len = params['max_len']

    def len_filter(example):
        return len(example.src) <= max_len and len(example.tgt) <= max_len

    train = torchtext.data.TabularDataset(path=opt.train_path,
                                          format='tsv',
                                          fields=[('src', src), ('tgt', tgt)],
                                          filter_pred=len_filter)
    dev = torchtext.data.TabularDataset(path=opt.dev_path,
                                        format='tsv',
                                        fields=[('src', src), ('tgt', tgt)],
                                        filter_pred=len_filter)
    src.build_vocab(train, max_size=params['src_vocab_size'])
    tgt.build_vocab(train, max_size=params['tgt_vocab_size'])
    input_vocab = src.vocab
    output_vocab = tgt.vocab

    # NOTE: If the source field name and the target field name
    # are different from 'src' and 'tgt' respectively, they have
    # to be set explicitly before any training or inference
    # seq2seq.src_field_name = 'src'
    # seq2seq.tgt_field_name = 'tgt'

    # Prepare loss
    weight = torch.ones(len(tgt.vocab))
    pad = tgt.vocab.stoi[tgt.pad_token]
    loss = Perplexity(weight, pad)
    if torch.cuda.is_available():
Example #14
    train = torchtext.data.TabularDataset(
        path=opt.train_path, format='tsv',
        fields=[('src', src), ('tgt', tgt)],
        filter_pred=len_filter
    )
    dev = torchtext.data.TabularDataset(
        path=opt.dev_path, format='tsv',
        fields=[('src', src), ('tgt', tgt)],
        filter_pred=len_filter
    )
    test = torchtext.data.TabularDataset(
        path=opt.test_path, format='tsv',
        fields=[('src', src), ('tgt', tgt)],
        filter_pred=len_filter
    )
    src.build_vocab(train, max_size=50000)
    tgt.build_vocab(train, max_size=50000)
    input_vocab = src.vocab
    output_vocab = tgt.vocab

    # inputs = torchtext.Field(lower=True, include_lengths=True, batch_first=True)
    # inputs.build_vocab(src.vocab)
    src.vocab.load_vectors(wv_type='glove.840B', wv_dim=300)

    # NOTE: If the source field name and the target field name
    # are different from 'src' and 'tgt' respectively, they have
    # to be set explicitly before any training or inference
    # seq2seq.src_field_name = 'src'
    # seq2seq.tgt_field_name = 'tgt'

    # Prepare loss
                                                    ('tgt', tgt)],
                                            train='test',
                                            validation='test',
                                            test='test')
adv_train, adv_dev, adv_test = Lang8.splits(exts=('.adv.cor', '.adv.err'),
                                            fields=[('src', src),
                                                    ('tgt', tgt)],
                                            train='test',
                                            validation='test',
                                            test='test')
adv_train_iter, adv_dev_iter, real_iter = torchtext.data.BucketIterator.splits(
    (adv_train, adv_dev, adv_train),
    batch_sizes=(1, 256, 256),
    device=device,
    sort_key=lambda x: len(x.src))
src.build_vocab(pre_train, pre_dev, pre_test, adv_train, adv_dev, adv_test)
tgt.build_vocab(pre_train, pre_dev, pre_test, adv_train, adv_dev, adv_test)
pad_id = tgt.vocab.stoi[tgt.pad_token]

# init generator
encoder = EncoderRNN(len(src.vocab),
                     max_len,
                     hidden_size,
                     bidirectional=bidirectional,
                     rnn_cell='lstm',
                     variable_lengths=True)
decoder = DecoderRNN(len(tgt.vocab),
                     max_len,
                     hidden_size * 2 if bidirectional else hidden_size,
                     dropout_p=0.2,
                     use_attention=True,
Example #16
    output_vocab = checkpoint.output_vocab
else:
    # Prepare dataset
    src = SourceField()
    tgt = TargetField()
    max_len = 50

    def len_filter(example):
        return len(example.src) <= max_len and len(example.tgt) <= max_len

    train = torchtext.data.TabularDataset(path=opt.train_path,
                                          format='tsv',
                                          fields=[('src', src), ('tgt', tgt)],
                                          filter_pred=len_filter)

    src.build_vocab(train, max_size=20000 - 2)
    tgt.build_vocab(train, max_size=20000 - 2)

    sos_id = tgt.vocab.stoi['<sos>']

    # Prepare loss
    weight = torch.ones(len(tgt.vocab))
    pad = tgt.vocab.stoi[tgt.pad_token]
    loss = Perplexity(weight, pad)
    if torch.cuda.is_available():
        loss.cuda()

    seq2seq = None
    optimizer = None
    if not opt.resume:
        # Initialize model
Example #17
    tgt = TargetField()
    max_len = 5

    def len_filter(example):
        return len(example.src) <= max_len and len(example.tgt) <= max_len

    train = torchtext.data.TabularDataset(path=opt.train_path,
                                          format='tsv',
                                          fields=[('src', src), ('tgt', tgt)],
                                          filter_pred=len_filter)
    dev = torchtext.data.TabularDataset(path=opt.dev_path,
                                        format='tsv',
                                        fields=[('src', src), ('tgt', tgt)],
                                        filter_pred=len_filter)
    src.build_vocab(train,
                    wv_type='glove.6B',
                    fill_from_vectors=True,
                    max_size=100000)
    tgt.build_vocab(train,
                    wv_type='glove.6B',
                    fill_from_vectors=True,
                    max_size=100000)
    input_vocab = src.vocab
    output_vocab = tgt.vocab

    # NOTE: If the source field name and the target field name
    # are different from 'src' and 'tgt' respectively, they have
    # to be set explicitly before any training or inference
    # seq2seq.src_field_name = 'src'
    # seq2seq.tgt_field_name = 'tgt'

    # Prepare loss
Example #18
def offline_training(opt, target_file_path):

    # Prepare dataset with torchtext
    src = SourceField(tokenize=treebank_tokenizer)
    tgt = TargetField(tokenize=treebank_tokenizer)

    def sample_filter(sample):
        """ sample example for future purpose"""
        return True

    train = torchtext.data.TabularDataset(path=opt.train_path,
                                          format='tsv',
                                          fields=[('src', src), ('tgt', tgt)],
                                          filter_pred=sample_filter)
    dev = torchtext.data.TabularDataset(path=opt.dev_path,
                                        format='tsv',
                                        fields=[('src', src), ('tgt', tgt)],
                                        filter_pred=sample_filter)
    test = torchtext.data.TabularDataset(path=opt.dev_path,
                                         format='tsv',
                                         fields=[('src', src), ('tgt', tgt)],
                                         filter_pred=sample_filter)
    src.build_vocab(train, max_size=opt.src_vocab_size)
    tgt.build_vocab(train, max_size=opt.tgt_vocab_size)
    input_vocab = src.vocab
    output_vocab = tgt.vocab

    # NOTE: If the source field name and the target field name
    # are different from 'src' and 'tgt' respectively, they have
    # to be set explicitly before any training or inference
    # seq2seq.src_field_name = 'src'
    # seq2seq.tgt_field_name = 'tgt'

    # Prepare loss
    weight = torch.ones(len(tgt.vocab))
    pad = tgt.vocab.stoi[tgt.pad_token]
    if opt.loss == 'perplexity':
        loss = Perplexity(weight, pad)
    else:
        raise TypeError("unsupported loss: {}".format(opt.loss))

    seq2seq = None
    optimizer = None
    if not opt.resume:
        # Initialize model
        encoder = EncoderRNN(vocab_size=len(src.vocab),
                             max_len=opt.max_length,
                             hidden_size=opt.hidden_size,
                             input_dropout_p=opt.input_dropout_p,
                             dropout_p=opt.dropout_p,
                             n_layers=opt.n_layers,
                             bidirectional=opt.bidirectional,
                             rnn_cell=opt.rnn_cell,
                             variable_lengths=True,
                             embedding=input_vocab.vectors
                             if opt.use_pre_trained_embedding else None,
                             update_embedding=opt.update_embedding)
        decoder = DecoderRNN(vocab_size=len(tgt.vocab),
                             max_len=opt.max_length,
                             hidden_size=opt.hidden_size *
                             2 if opt.bidirectional else opt.hidden_size,
                             sos_id=tgt.sos_id,
                             eos_id=tgt.eos_id,
                             n_layers=opt.n_layers,
                             rnn_cell=opt.rnn_cell,
                             bidirectional=opt.bidirectional,
                             input_dropout_p=opt.input_dropout_p,
                             dropout_p=opt.dropout_p,
                             use_attention=opt.use_attention)
        seq2seq = Seq2seq(encoder=encoder, decoder=decoder)
        if opt.gpu >= 0 and torch.cuda.is_available():
            seq2seq.cuda()

        for param in seq2seq.parameters():
            param.data.uniform_(-0.08, 0.08)
    # train
    trainer = SupervisedTrainer(loss=loss,
                                batch_size=opt.batch_size,
                                checkpoint_every=opt.checkpoint_every,
                                print_every=opt.print_every,
                                expt_dir=opt.expt_dir)
    seq2seq = trainer.train(model=seq2seq,
                            data=train,
                            num_epochs=opt.epochs,
                            resume=opt.resume,
                            dev_data=dev,
                            optimizer=optimizer,
                            teacher_forcing_ratio=opt.teacher_forcing_rate)
Example #19
class auto_seq2seq:
    def __init__(self,
                 data_path,
                 model_save_path,
                 model_load_path,
                 hidden_size=32,
                 max_vocab=4000,
                 device='cuda'):
        self.src = SourceField()
        self.tgt = TargetField()
        self.max_length = 90
        self.data_path = data_path
        self.model_save_path = model_save_path
        self.model_load_path = model_load_path

        def len_filter(example):
            return len(example.src) <= self.max_length and len(
                example.tgt) <= self.max_length

        self.trainset = torchtext.data.TabularDataset(
            path=os.path.join(self.data_path, 'train'),
            format='tsv',
            fields=[('src', self.src), ('tgt', self.tgt)],
            filter_pred=len_filter)
        self.devset = torchtext.data.TabularDataset(path=os.path.join(
            self.data_path, 'eval'),
                                                    format='tsv',
                                                    fields=[('src', self.src),
                                                            ('tgt', self.tgt)],
                                                    filter_pred=len_filter)
        self.src.build_vocab(self.trainset, max_size=max_vocab)
        self.tgt.build_vocab(self.trainset, max_size=max_vocab)
        weight = torch.ones(len(self.tgt.vocab))
        pad = self.tgt.vocab.stoi[self.tgt.pad_token]
        self.loss = Perplexity(weight, pad)
        self.loss.cuda()
        self.optimizer = None
        self.hidden_size = hidden_size
        self.bidirectional = True
        encoder = EncoderRNN(len(self.src.vocab),
                             self.max_length,
                             self.hidden_size,
                             bidirectional=self.bidirectional,
                             variable_lengths=True)
        decoder = DecoderRNN(len(self.tgt.vocab),
                             self.max_length,
                             self.hidden_size *
                             2 if self.bidirectional else self.hidden_size,
                             dropout_p=0.2,
                             use_attention=True,
                             bidirectional=self.bidirectional,
                             eos_id=self.tgt.eos_id,
                             sos_id=self.tgt.sos_id)
        self.device = device
        self.seq2seq = Seq2seq(encoder, decoder).cuda()
        for param in self.seq2seq.parameters():
            param.data.uniform_(-0.08, 0.08)

    def train(self, epoch=20, resume=False):
        t = SupervisedTrainer(loss=self.loss,
                              batch_size=96,
                              checkpoint_every=1000,
                              print_every=1000,
                              expt_dir=self.model_save_path)
        self.seq2seq = t.train(self.seq2seq,
                               self.trainset,
                               num_epochs=epoch,
                               dev_data=self.devset,
                               optimizer=self.optimizer,
                               teacher_forcing_ratio=0.5,
                               resume=resume)
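The class stops at training; an evaluation helper in the same style, reusing the Evaluator pattern from Example #21 below (Evaluator import assumed), might be sketched as:

    def evaluate(self):
        # Sketch only (not in the original class): score the dev set with the
        # same Perplexity loss, following the Evaluator usage in Example #21.
        evaluator = Evaluator(loss=self.loss, batch_size=96)
        dev_loss, accuracy = evaluator.evaluate(self.seq2seq, self.devset)
        return dev_loss, accuracy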
def train(opt):
    LOG_FORMAT = '%(asctime)s %(levelname)-8s %(message)s'
    logging.basicConfig(format=LOG_FORMAT,
                        level=getattr(logging, opt.log_level.upper()))
    logging.info(opt)
    if int(opt.GPU) >= 0:
        torch.cuda.set_device(int(opt.GPU))
    if opt.load_checkpoint is not None:
        logging.info("loading checkpoint from {}".format(
            os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME,
                         opt.load_checkpoint)))
        checkpoint_path = os.path.join(opt.expt_dir,
                                       Checkpoint.CHECKPOINT_DIR_NAME,
                                       opt.load_checkpoint)
        checkpoint = Checkpoint.load(checkpoint_path)
        seq2tree = checkpoint.model
        input_vocab = checkpoint.input_vocab

    else:
        # Prepare dataset
        src = SourceField()
        nt = NTField()
        pos = PosField()
        tgt_tree = TreeField()
        comp = CompField()
        max_len = opt.max_len

        def len_filter(example):
            return len(example.src) <= max_len

        train = torchtext.data.TabularDataset(path=opt.train_path,
                                              format='tsv',
                                              fields=[('src', src), ('nt', nt),
                                                      ('pos', pos),
                                                      ('tree', tgt_tree)],
                                              filter_pred=len_filter)
        dev = torchtext.data.TabularDataset(path=opt.dev_path,
                                            format='tsv',
                                            fields=[('src', src), ('nt', nt),
                                                    ('pos', pos),
                                                    ('tree', tgt_tree)],
                                            filter_pred=len_filter)
        src.build_vocab(train, max_size=50000)
        comp.build_vocab(train, max_size=50000)
        nt.build_vocab(train, max_size=50000)
        pos.build_vocab(train, max_size=50000)
        # src_tree.build_vocab(train, max_size=50000)
        pos_in_nt = set()
        for Pos in pos.vocab.stoi:
            if nt.vocab.stoi[Pos] > 1:
                pos_in_nt.add(nt.vocab.stoi[Pos])
        hidden_size = opt.hidden_size
        input_vocab = src.vocab
        nt_vocab = nt.vocab

        def tree_to_id(tree):
            tree.set_label(nt_vocab.stoi[tree.label()])
            if len(tree) == 1 and str(tree[0])[0] != '(':
                tree[0] = input_vocab.stoi[tree[0]]
                return
            else:
                for subtree in tree:
                    tree_to_id(subtree)
                tree.append(Tree(nt_vocab.stoi['<eos>'], []))
                return tree

        # train.examples = [str(tree_to_id(ex.tree)) for ex in train.examples]
        # dev.examples = [str(tree_to_id(ex.tree)) for ex in dev.examples]
        for ex in train.examples:
            ex.tree = str(tree_to_id(Tree.fromstring(ex.tree)))
        for ex in dev.examples:
            ex.tree = str(tree_to_id(Tree.fromstring(ex.tree)))
        # train.examples = [tree_to_id(Tree.fromstring(ex.tree)) for ex in train.examples]
        # dev.examples = [str(tree_to_id(Tree.fromstring(ex.tree))) for ex in dev.examples]
        if opt.word_embedding is not None:
            input_vocab.load_vectors([opt.word_embedding])

        loss = NLLLoss()
        if torch.cuda.is_available():
            loss.cuda()
        loss.reset()
        seq2tree = None
        optimizer = None
        if not opt.resume:
            # Initialize model
            bidirectional = opt.bidirectional_encoder
            encoder = EncoderRNN(len(src.vocab),
                                 opt.word_embedding_size,
                                 max_len,
                                 hidden_size,
                                 bidirectional=bidirectional,
                                 variable_lengths=True)
            decoder = DecoderTree(len(src.vocab),
                                  opt.word_embedding_size,
                                  opt.nt_embedding_size,
                                  len(nt.vocab),
                                  max_len,
                                  hidden_size *
                                  2 if bidirectional else hidden_size,
                                  sos_id=nt_vocab.stoi['<sos>'],
                                  eos_id=nt_vocab.stoi['<eos>'],
                                  dropout_p=0.2,
                                  use_attention=True,
                                  bidirectional=bidirectional,
                                  pos_in_nt=pos_in_nt)

            seq2tree = Seq2tree(encoder, decoder)
            if torch.cuda.is_available():
                seq2tree.cuda()

            for param in seq2tree.parameters():
                param.data.uniform_(-0.08, 0.08)
                # encoder.embedding.weight.data.set_(input_vocab.vectors)
                # encoder.embedding.weight.data.set_(output_vocab.vectors)

            # Optimizer and learning rate scheduler can be customized by
            # explicitly constructing the objects and passing them to the trainer.
            #
            # optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
            # scheduler = StepLR(optimizer.optimizer, 1)
            # optimizer.set_scheduler(scheduler)

            optimizer = Optimizer(optim.Adam(seq2tree.parameters(), lr=opt.lr),
                                  max_grad_norm=5)
        # train
        t = SupervisedTrainer(loss=loss,
                              batch_size=opt.batch_size,
                              checkpoint_every=opt.checkpoint_every,
                              print_every=10,
                              expt_dir=opt.expt_dir,
                              lr=opt.lr)

        seq2tree = t.train(seq2tree,
                           train,
                           num_epochs=opt.epoch,
                           dev_data=dev,
                           optimizer=optimizer,
                           teacher_forcing_ratio=0,
                           resume=opt.resume)

    predictor = Predictor(seq2tree, input_vocab, nt_vocab)
    return predictor, dev, train
Example #21
def run_training(opt, default_data_dir, num_epochs=100):
    if opt.load_checkpoint is not None:
        logging.info("loading checkpoint from {}".format(
            os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME, opt.load_checkpoint)))
        checkpoint_path = os.path.join(opt.expt_dir, Checkpoint.CHECKPOINT_DIR_NAME, opt.load_checkpoint)
        checkpoint = Checkpoint.load(checkpoint_path)
        seq2seq = checkpoint.model
        input_vocab = checkpoint.input_vocab
        output_vocab = checkpoint.output_vocab
    else:

        # Prepare dataset
        src = SourceField()
        tgt = TargetField()
        max_len = 50

        data_file = os.path.join(default_data_dir, opt.train_path, 'data.txt')

        logging.info("Starting new Training session on %s", data_file)

        def len_filter(example):
            return (len(example.src) <= max_len) and (len(example.tgt) <= max_len) \
                   and (len(example.src) > 0) and (len(example.tgt) > 0)

        train = torchtext.data.TabularDataset(
            path=data_file, format='json',
            fields={'src': ('src', src), 'tgt': ('tgt', tgt)},
            filter_pred=len_filter
        )

        dev = None
        if opt.no_dev is False:
            dev_data_file = os.path.join(default_data_dir, opt.train_path, 'dev-data.txt')
            dev = torchtext.data.TabularDataset(
                path=dev_data_file, format='json',
                fields={'src': ('src', src), 'tgt': ('tgt', tgt)},
                filter_pred=len_filter
            )

        src.build_vocab(train, max_size=50000)
        tgt.build_vocab(train, max_size=50000)
        input_vocab = src.vocab
        output_vocab = tgt.vocab

        # NOTE: If the source field name and the target field name
        # are different from 'src' and 'tgt' respectively, they have
        # to be set explicitly before any training or inference
        # seq2seq.src_field_name = 'src'
        # seq2seq.tgt_field_name = 'tgt'

        # Prepare loss
        weight = torch.ones(len(tgt.vocab))
        pad = tgt.vocab.stoi[tgt.pad_token]
        loss = Perplexity(weight, pad)
        if torch.cuda.is_available():
            logging.info("Yayyy We got CUDA!!!")
            loss.cuda()
        else:
            logging.info("No cuda available device found running on cpu")

        seq2seq = None
        optimizer = None
        if not opt.resume:
            hidden_size = 128
            decoder_hidden_size = hidden_size * 2
            logging.info("EncoderRNN Hidden Size: %s", hidden_size)
            logging.info("DecoderRNN Hidden Size: %s", decoder_hidden_size)
            bidirectional = True
            encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                                 bidirectional=bidirectional,
                                 rnn_cell='lstm',
                                 variable_lengths=True)
            decoder = DecoderRNN(len(tgt.vocab), max_len, decoder_hidden_size,
                                 dropout_p=0, use_attention=True,
                                 bidirectional=bidirectional,
                                 rnn_cell='lstm',
                                 eos_id=tgt.eos_id, sos_id=tgt.sos_id)

            seq2seq = Seq2seq(encoder, decoder)
            if torch.cuda.is_available():
                seq2seq.cuda()

            for param in seq2seq.parameters():
                param.data.uniform_(-0.08, 0.08)

        # Optimizer and learning rate scheduler can be customized by
        # explicitly constructing the objects and passing them to the trainer.

        optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
        scheduler = StepLR(optimizer.optimizer, 1)
        optimizer.set_scheduler(scheduler)

        # train

        batch_size = 32
        checkpoint_every = num_epochs // 10
        print_every = num_epochs // 100

        properties = dict(batch_size=batch_size,
                          checkpoint_every=checkpoint_every,
                          print_every=print_every, expt_dir=opt.expt_dir,
                          num_epochs=num_epochs,
                          teacher_forcing_ratio=0.5,
                          resume=opt.resume)

        logging.info("Starting training with the following Properties %s", json.dumps(properties, indent=2))
        t = SupervisedTrainer(loss=loss, batch_size=batch_size,
                              checkpoint_every=checkpoint_every,
                              print_every=print_every, expt_dir=opt.expt_dir)

        seq2seq = t.train(seq2seq, train,
                          num_epochs=num_epochs, dev_data=dev,
                          optimizer=optimizer,
                          teacher_forcing_ratio=0.5,
                          resume=opt.resume)

        evaluator = Evaluator(loss=loss, batch_size=batch_size)

        if opt.no_dev is False:
            dev_loss, accuracy = evaluator.evaluate(seq2seq, dev)
            logging.info("Dev Loss: %s", dev_loss)
            logging.info("Accuracy: %s", dev_loss)

    beam_search = Seq2seq(seq2seq.encoder, TopKDecoder(seq2seq.decoder, 4))

    predictor = Predictor(beam_search, input_vocab, output_vocab)
    while True:
        try:
            seq_str = input("Type in a source sequence:")
            seq = seq_str.strip().split()
            results = predictor.predict_n(seq, n=3)
            for i, res in enumerate(results):
                print('option %s: %s\n' % (i + 1, res))
        except KeyboardInterrupt:
            logging.info("Bye Bye")
            exit(0)
Example #22
#################################################################################
# prepare model

if opt.load_checkpoint is not None:
    logging.info("loading checkpoint from {}".format(
        os.path.join(opt.output_dir, opt.load_checkpoint)))
    checkpoint_path = os.path.join(opt.output_dir, opt.load_checkpoint)
    checkpoint = Checkpoint.load(checkpoint_path)
    seq2seq = checkpoint.model
    input_vocab = checkpoint.input_vocab
    output_vocab = checkpoint.output_vocab
    src.vocab = input_vocab
    tgt.vocab = output_vocab
else:
    # build vocabulary
    src.build_vocab(train, max_size=opt.src_vocab)
    tgt.build_vocab(train, max_size=opt.tgt_vocab)
    input_vocab = src.vocab
    output_vocab = tgt.vocab

    # Initialize model
    hidden_size = opt.hidden_size
    decoder_hidden_size = hidden_size * 2 if opt.bidirectional else hidden_size
    encoder = EncoderRNN(len(src.vocab),
                         max_len,
                         hidden_size,
                         opt.embedding_size,
                         bidirectional=opt.bidirectional,
                         rnn_cell=opt.rnn_cell,
                         variable_lengths=True)
    decoder = DecoderRNN(len(tgt.vocab),
Example #23
    def len_filter(example):
        return len(example.src) <= max_len and len(example.tgt) <= max_len

    train = torchtext.data.TabularDataset(path=opt.train_path,
                                          format='tsv',
                                          fields=[('src', src), ('tgt', tgt)],
                                          filter_pred=len_filter)
    dev = torchtext.data.TabularDataset(path=opt.dev_path,
                                        format='tsv',
                                        fields=[('src', src), ('tgt', tgt)],
                                        filter_pred=len_filter)
    test = torchtext.data.TabularDataset(path=opt.test_path,
                                         format='tsv',
                                         fields=[('src', src), ('tgt', tgt)],
                                         filter_pred=len_filter)
    src.build_vocab(train, dev, test, max_size=50000)
    tgt.build_vocab(train, dev, test, max_size=1000)
    input_vocab = src.vocab
    output_vocab = tgt.vocab

    batch_iterator = torchtext.data.BucketIterator(
        dataset=train,
        batch_size=8,
        sort_key=lambda x: -len(x.src),
        repeat=False)
    for i, batch in enumerate(batch_iterator):
        input_variables, input_lengths = getattr(batch, 'src')
        target_variables = getattr(batch, 'tgt')
        print('Train inspection')
        for j, l in enumerate(input_lengths):
            src_indices = input_variables[j][0:l].data.tolist()
    max_len = int(myconf.model.maxlen)
    def len_filter(example):
        return len(example.src) <= max_len and len(example.tgt) <= max_len
    train = torchtext.data.TabularDataset(
        path=opt.train_path, format='tsv',
        fields=[('src', src), ('tgt', tgt)],
        filter_pred=len_filter
    )
    dev = torchtext.data.TabularDataset(
        path=opt.dev_path, format='tsv',
        fields=[('src', src), ('tgt', tgt)],
        filter_pred=len_filter
    )
    # Build the corpus vocabulary and load the pretrained word embeddings; custom vectors are supplied via vocab.Vectors.
    vectors = vocab.Vectors(myconf.embedding.char2vec)
    src.build_vocab(train, max_size=int(myconf.model.src_vocab_size))
    tgt.build_vocab(train, max_size=int(myconf.model.tgt_vocab_size))
    src.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)
    input_vocab = src.vocab
    output_vocab = tgt.vocab

    # NOTE: If the source field name and the target field name
    # are different from 'src' and 'tgt' respectively, they have
    # to be set explicitly before any training or inference
    # seq2seq.src_field_name = 'src'
    # seq2seq.tgt_field_name = 'tgt'

    # Prepare loss
    weight = torch.ones(len(tgt.vocab))
    pad = tgt.vocab.stoi[tgt.pad_token]
    loss = Perplexity(weight, pad)
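From here the remaining steps follow the fuller examples above (model construction, weight init, SupervisedTrainer); a condensed sketch, where opt.expt_dir and the hyper-parameters are illustrative assumptions:

    # Condensed sketch of the usual next steps, following Examples #10, #19
    # and #21 above; opt.expt_dir and the values below are illustrative.
    if torch.cuda.is_available():
        loss.cuda()

    hidden_size = 128
    bidirectional = True
    encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                         bidirectional=bidirectional, variable_lengths=True)
    decoder = DecoderRNN(len(tgt.vocab), max_len,
                         hidden_size * 2 if bidirectional else hidden_size,
                         use_attention=True, bidirectional=bidirectional,
                         eos_id=tgt.eos_id, sos_id=tgt.sos_id)
    seq2seq = Seq2seq(encoder, decoder)
    if torch.cuda.is_available():
        seq2seq.cuda()
    for param in seq2seq.parameters():
        param.data.uniform_(-0.08, 0.08)

    trainer = SupervisedTrainer(loss=loss, batch_size=32,
                                checkpoint_every=50, print_every=10,
                                expt_dir=opt.expt_dir)
    seq2seq = trainer.train(seq2seq, train, num_epochs=6, dev_data=dev,
                            optimizer=None, teacher_forcing_ratio=0.5,
                            resume=False)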