Example no. 1
    def __init__(self, args, train_loader, test_loader, tokenizer_src, tokenizer_tgt):
        self.args = args
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.src_vocab_size = tokenizer_src.vocab_size
        self.tgt_vocab_size = tokenizer_tgt.vocab_size
        self.pad_id = tokenizer_src.pad_token_id  # tokenizer_tgt.pad_token_id is assumed to be the same value
        self.device = 'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu'

        self.model = Transformer(src_vocab_size = self.src_vocab_size,
                                 tgt_vocab_size = self.tgt_vocab_size,
                                 seq_len        = args.max_seq_len,
                                 d_model        = args.hidden,
                                 n_layers       = args.n_layers,
                                 n_heads        = args.n_attn_heads,
                                 p_drop         = args.dropout,
                                 d_ff           = args.ffn_hidden,
                                 pad_id         = self.pad_id)
        if args.multi_gpu:
            self.model = nn.DataParallel(self.model)
        self.model.to(self.device)

        self.optimizer = ScheduledOptim(optim.Adam(self.model.parameters(), betas=(0.9, 0.98), eps=1e-9),
                                        init_lr=2.0, d_model=args.hidden)
        self.criterion = nn.CrossEntropyLoss(ignore_index=self.pad_id)
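
The ScheduledOptim wrapper above derives Adam's learning rate from init_lr and d_model. Below is a minimal sketch of such a wrapper, assuming the inverse-square-root warmup schedule from the original Transformer paper and a hypothetical n_warmup_steps argument; the repository's own class may differ in detail.

class NoamScheduledOptim:
    """Sketch of an inverse-square-root warmup scheduler around an optimizer."""

    def __init__(self, optimizer, init_lr, d_model, n_warmup_steps=4000):
        self.optimizer = optimizer
        self.init_lr = init_lr
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.n_steps = 0

    def step(self):
        # lr = init_lr * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
        self.n_steps += 1
        lr = self.init_lr * self.d_model ** -0.5 * min(
            self.n_steps ** -0.5, self.n_steps * self.n_warmup_steps ** -1.5)
        for group in self.optimizer.param_groups:
            group['lr'] = lr
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()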
Example no. 2
    def __init__(self, hparams, **kwargs):
        super(Transformer_pl, self).__init__()
        self.hparams = hparams
        self.transformer = Transformer(self.hparams)

        self.sp_kor = korean_tokenizer_load()
        self.sp_eng = english_tokenizer_load()
Example no. 3
    def build_train_model(self):
        self.train_mode = None
        print("# Select train mode [{}]".format("/".join([i[:3] for i in TRAIN_MODE_LIST])))
        for mode in TRAIN_MODE_LIST:
            if mode.startswith(self.hparams.train_mode):
                self.train_mode = mode
        assert self.train_mode

        self.data_loader = DataLoader(hparams = self.hparams, training = self.training, mode = self.train_mode)

        with tf.variable_scope('Network_Operator'):
            self.dataset_handler = tf.placeholder(tf.string, shape=[], name='dataset_handler')
            self.train_batch_iter = self.data_loader.get_training_batch(self.data_loader.train_dataset)
            self.test_batch_iter = self.data_loader.get_training_batch(self.data_loader.test_dataset)
            self.train_dataset_count, self.test_dataset_count = self.data_loader.train_dataset_count, self.data_loader.test_dataset_count
            input_batch = self.data_loader.multiple_batch(self.dataset_handler, self.train_batch_iter.batched_dataset)

        print("# Build model =", self.train_mode)
        self.model = Transformer(mode = self.train_mode,
                                 graph = self.graph,
                                 hparams = self.hparams,
                                 data_loader = self.data_loader,
                                 batch_input = input_batch)

        self.global_step = self.model.global_step
        self.epoch_num = self.model.train_epoch
Example no. 4
def train():
    inputs, src_vocab_size, tgt_vocab_size, idx2word = create_data()

    enc_inputs, dec_inputs, dec_outputs = make_data(*inputs)
    data_loader = Data.DataLoader(dataset=MyDataSet(enc_inputs, dec_inputs, dec_outputs),
                                  batch_size=2,
                                  shuffle=True)

    model = Transformer(src_vocab_size, tgt_vocab_size).cuda()
    criterion = nn.CrossEntropyLoss(ignore_index=0)  # PAD itself carries no meaning; its word index is 0, so ignore_index=0 keeps PAD positions out of the loss
    optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.09)

    for epoch in range(30):
        for enc_inputs, dec_inputs, dec_outputs in data_loader:
            """
            enc_inputs: [batch_size, src_len]
            dec_inputs: [batch_size, tgt_len]
            dec_outputs: [batch_size, tgt_len]
            """

            enc_inputs, dec_inputs, dec_outputs = enc_inputs.cuda(), dec_inputs.cuda(), dec_outputs.cuda()

            outputs, enc_self_attns, dec_self_attns, dec_enc_attns = model(enc_inputs, dec_inputs)
            loss = criterion(outputs, dec_outputs.view(-1))

            print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
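
The ignore_index=0 setting above is what keeps PAD positions out of the loss; the following self-contained check illustrates that behaviour (the tensors are made up for illustration).

import torch
import torch.nn as nn

logits = torch.tensor([[2.0, 0.1, 0.3],   # position with a real target (index 2)
                       [0.5, 1.5, 0.2]])  # padded position (target index 0)
targets = torch.tensor([2, 0])

with_pad_masked = nn.CrossEntropyLoss(ignore_index=0)(logits, targets)
real_positions_only = nn.CrossEntropyLoss()(logits[:1], targets[:1])
assert torch.allclose(with_pad_masked, real_positions_only)  # the PAD position contributed nothing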
Example no. 5
def main():
    dataset = Dataset(transform=transform, n_datas=10000,
                      seed=None)  # generate 10,000 samples so that every character appears
    model = Transformer(n_head=2)
    try:
        trained_epoch = sl.find_last_checkpoint('./checkpoint')
        print('load model %d' % (trained_epoch))
    except Exception as e:
        print('no trained model found, {}'.format(e))
        return
    model = sl.load_model('./checkpoint', -1, model)
    model.eval()

    x, y, extra = dataset[0]  # only the 0th feature vector of y is used, i.e. the one-hot vector for <pad>
    # print(x.shape, y.shape)
    # pred = model(torch.from_numpy(x).unsqueeze(0), torch.from_numpy(y).unsqueeze(0)).squeeze()
    pred = translate(model, x,
                     y[0])  # for date-format conversion the whole input sequence is known, but only the leading <pad> of the output sequence is known
    # print(pred.shape)
    pred = np.argmax(pred.detach().numpy(), axis=1)[1:]
    # print(extra['machine_readable'])
    pred = [dataset.inv_machine_vocab[p] for p in pred]
    pred_str = ''.join(pred)
    human_readable = extra['human_readable']
    machine_readable = extra['machine_readable']
    print('[%s] --> [%s], answer: [%s]' %
          (human_readable, pred_str, list(machine_readable)))

    dec_scores = model.decoder.scores_for_paint
    # print(dec_scores.shape)
    paint_score(dec_scores[0], human_readable, pred)  # [0] selects the 0th sample in the batch
Example no. 6
def generate(
    x: str,
    beam_width: int,
    device: torch.device,
    max_seq_len: int,
    model: Transformer,
    tokenizer: Tokenizer
) -> str:
    model.eval()
    seq = torch.LongTensor([tokenizer.bos_id]).to(device)
    x = torch.LongTensor([tokenizer.encode(x, max_len=-1)]).to(device)

    accum_prob = torch.zeros(beam_width).to(device)

    for _ in range(max_seq_len):
        pred_y = model.predict(x, seq)

        top_k_in_all_beams = []
        for out_beams in range(seq.size(0)):
            top_k_prob_in_beam, top_k_index_in_beam = \
                pred_y[out_beams, -1].topk(
                    k=beam_width,
                    dim=-1
                )
            for in_beam in range(beam_width):

                prob = accum_prob[out_beams] -\
                    top_k_prob_in_beam[in_beam].log()
                prob = prob.unsqueeze(0)

                temp_seq = torch.cat([
                    seq[out_beams],
                    top_k_index_in_beam[in_beam].unsqueeze(0)
                ], dim=-1).unsqueeze(0)

                top_k_in_all_beams.append({
                    'prob': prob,
                    'seq': temp_seq
                })

        _, top_k_index_in_all_beams = torch.cat([
            beam['prob'] for beam in top_k_in_all_beams
        ]).topk(k=beam_width, dim=0)

        seq = torch.cat([
            top_k_in_all_beams[index]['seq']
            for index in top_k_index_in_all_beams
        ], dim=0)

        accum_prob = torch.cat([
            top_k_in_all_beams[index]['prob']
            for index in top_k_index_in_all_beams
        ], dim=0)

        if x.size(0) != seq.size(0):
            x = x.repeat(seq.size(0) // x.size(0), 1)

    for i in tokenizer.batch_decode(seq.tolist()):
        print(i)
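
For comparison with the beam search above, here is a greedy counterpart written against the same assumed interface (model.predict, tokenizer.encode, tokenizer.bos_id, tokenizer.batch_decode); the eos_id attribute and the [batch, length] shapes are assumptions rather than facts about the original code.

import torch

@torch.no_grad()
def greedy_generate(x, device, max_seq_len, model, tokenizer):
    model.eval()
    src = torch.LongTensor([tokenizer.encode(x, max_len=-1)]).to(device)
    seq = torch.LongTensor([[tokenizer.bos_id]]).to(device)

    for _ in range(max_seq_len):
        pred_y = model.predict(src, seq)          # assumed shape: [1, cur_len, vocab]
        next_id = pred_y[0, -1].argmax(dim=-1).view(1, 1)
        seq = torch.cat([seq, next_id], dim=-1)
        if hasattr(tokenizer, 'eos_id') and next_id.item() == tokenizer.eos_id:
            break

    return tokenizer.batch_decode(seq.tolist())[0]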
Example no. 7
def main():
    device = config.device

    p = Preprocess("data/europarl-v7.fr-en.en", "data/europarl-v7.fr-en.fr")

    transformer = Transformer(p.src_word2ind, p.trg_word2ind)
    transformer.to(device)

    train(p, transformer)
Example no. 8
    def test_transformer_with_convolution(self):
        train_dataset = StocksDataset(files=FILES[:10], min_length=30)
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=2,
                                      shuffle=False)

        model = Transformer(use_convolutions=True).double()
        for batch in train_dataloader:
            model.training_step(batch.double(), 0)
            break
Example no. 9
    def test_vanilla_transformer(self):

        train_dataset = StocksDataset(files=FILES[:10], min_length=30)
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=2,
                                      shuffle=False)

        model = Transformer().double()
        for batch in train_dataloader:
            model.training_step(batch.double(), 0)
            break
Example no. 10
    def build_model(self, args):
        encoder_embed_tokens = nn.Embedding(
            self.src_dict.token_num,
            args.encoder_embed_dim,
            padding_idx=self.src_dict.padding_idx)
        if args.share_all_embeddings:
            decoder_embed_tokens = encoder_embed_tokens
        else:
            decoder_embed_tokens = nn.Embedding(
                self.trg_dict.token_num,
                args.decoder_embed_dim,
                padding_idx=self.trg_dict.padding_idx)
        self.model = Transformer(args, self.src_dict, self.trg_dict)
Example no. 11
    def init_from_config(self, config):
        # self.model = Model(config)
        self.model = Transformer(config, config.test.devices)
        self.model.build_test_model()

        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        sess_config.allow_soft_placement = True
        self.sess = tf.Session(config=sess_config, graph=self.model.graph)
        # Restore model.
        self.model.saver.restore(self.sess, tf.train.latest_checkpoint(config.train.logdir))

        self.data_reader = DataReader(config)
Example no. 12
def gen_soft_labels(c):
    c.setdefault(hebbian=False, distributed=False)
    net = Transformer(c)
    net, step = c.init_model(net, step='max', train=False)

    print('generating soft labels...')
    data_gen_tr = SequentialIterator(c, 1, 'train')
    net.eval()
    with torch.no_grad():
        i = 0
        for batch in tqdm(data_gen_tr):
            x = to_torch(batch, c.device).t()
            inputs, labels = x[:-1], x[1:]
            probs, _ = net(inputs, labels)

            values, indices = torch.topk(probs, c.topk, dim=1)

            indices_ = indices.cpu().numpy()
            values_ = values.cpu().numpy()
            labels_ = labels.cpu().numpy()

            if probs.size(0) != inputs.size(0):
                indices_ = indices_[-inputs.size(0):, :]
                values_ = values_[-inputs.size(0):, :]

            if i == 0:
                all_soft_indices = indices_
                all_soft_values = values_
            else:
                all_soft_indices = np.concatenate((all_soft_indices, indices_),
                                                  axis=0)
                all_soft_values = np.concatenate((all_soft_values, values_),
                                                 axis=0)

            i += 1
    all_soft_indices = np.concatenate(
        (all_soft_indices[0:1, :], all_soft_indices), axis=0)
    all_soft_values = np.concatenate(
        (all_soft_values[0:1, :], all_soft_values), axis=0)

    np.save(Cache / 'wikitext-103' / 'train_soft_labels.npy', all_soft_indices)
    np.save(Cache / 'wikitext-103' / 'train_soft_probs.npy', all_soft_values)
    print('Saved %s' % (Cache / 'wikitext-103' / 'train_soft_labels.npy'))
    print('Saved %s' % (Cache / 'wikitext-103' / 'train_soft_probs.npy'))

    cnt = 0.
    for k in range(len(data_gen_tr.tokens)):
        if data_gen_tr.tokens[k] in all_soft_indices[k]:
            cnt += 1
    print('%s%% of the tokens are predicted within the top %s logits' %
          (100 * cnt / len(data_gen_tr.tokens), c.topk))
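
The saved top-k indices and probabilities are the usual ingredients of a knowledge-distillation loss. The consumer below is not part of the original code, only a sketch of one way to use the arrays, assuming student logits of shape [N, vocab] aligned row-for-row with the saved arrays.

import torch
import torch.nn.functional as F

def topk_distillation_loss(student_logits, soft_indices, soft_values):
    """Cross-entropy against a sparse top-k teacher distribution."""
    log_q = F.log_softmax(student_logits, dim=-1)   # [N, vocab]
    idx = torch.from_numpy(soft_indices).long()     # [N, k] teacher token ids
    p = torch.from_numpy(soft_values).float()       # [N, k] teacher probabilities
    p = p / p.sum(dim=-1, keepdim=True)             # renormalise over the kept k entries
    return -(p * log_q.gather(-1, idx)).sum(dim=-1).mean()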
Example no. 13
    def __init__(self, model_source, rewrite_len=30, beam_size=4, debug=False):
        self.beam_size = beam_size
        self.rewrite_len = rewrite_len
        self.debug = debug

        model_source = torch.load(model_source,
                                  map_location=lambda storage, loc: storage)
        self.dict = model_source["word2idx"]
        self.idx2word = {v: k for k, v in model_source["word2idx"].items()}
        self.args = args = model_source["settings"]
        torch.manual_seed(args.seed)
        model = Transformer(args)
        model.load_state_dict(model_source['model'])
        self.model = model.eval()
Example no. 14
    def build_predict_model(self):
        self.src_placeholder = tf.placeholder(shape=[None], dtype=tf.string, name = 'Inputs')
        self.src_length_placeholder = tf.placeholder(shape=[None], dtype=tf.int32, name = 'Inputs_length')
        src_dataset = tf.data.Dataset.from_tensor_slices((self.src_placeholder, self.src_length_placeholder))
        self.infer_batch = self.data_loader.get_inference_batch(src_dataset)
        print("# Build inference model ...")
        self.model = Transformer(mode = 'inference',
                                 graph = self.graph,
                                 hparams = self.hparams,
                                 data_loader = self.data_loader,
                                 batch_input = self.infer_batch)
        print("# Restoring model weights ...")
        self.saver, self.restore = variable_loader(self.session, RESULT_DIR)
        assert self.restore
        self.session.run(tf.tables_initializer())
Example no. 15
def main():
    global D_MODEL, N_LAYERS, N_HEADS, DROPOUT, N_EPOCHS, B_SIZE, LR

    D_MODEL = args.modeldim
    N_LAYERS = args.nlayers
    N_HEADS = args.nheads
    DROPOUT = args.dropout
    N_EPOCHS = args.epochs
    B_SIZE = args.batchsize
    LR = args.lr

    train_iter, val_iter, TEXT, LABEL = get_dataiter(args.datapath,
                                                     batch_size=B_SIZE)

    if args.predict:
        model = Transformer(len(TEXT.vocab),
                            len(LABEL.vocab),
                            D_MODEL,
                            N_LAYERS,
                            N_HEADS,
                            dropout=DROPOUT)
        model = torch.load(args.predmodel, map_location=torch.device('cpu'))
        predict(model, args.predict, TEXT, LABEL, custom_sent=True)
        exit(0)

    print(
        f'Training start time: {datetime.now().strftime("%d/%m/%Y %H:%M:%S")}')
    if args.linear:
        el_train(train_iter, val_iter, TEXT, LABEL)
    else:
        ed_train(train_iter, val_iter, TEXT, LABEL)
    print(
        f'Training completion time: {datetime.now().strftime("%d/%m/%Y %H:%M:%S")}'
    )
Example no. 16
class Translation(object):
    def __init__(self, args):
        super(Translation, self).__init__()
        self.datasets = {}
        self.args = args  # kept so build_optimizer can read lr/betas later
        self.data_dir = args.data_dir

        self.src_lang, self.trg_lang = dataset_utils.infer_language_pair(
            args.data_dir)

        src_dict_path = os.path.join(args.data_dir,
                                     dict_path.format(self.src_lang))
        trg_dict_path = os.path.join(args.data_dir,
                                     dict_path.format(self.trg_lang))
        self.src_dict = Dictionary.build_from_dict_file(src_dict_path)
        self.trg_dict = Dictionary.build_from_dict_file(trg_dict_path)

        self.model = None
        self.criterion = None
        self.optimizer = None

    def load_dataset(self, split):
        # locate the data file paths for this split
        src_split_path = os.path.join(
            self.data_dir,
            subset_path.format(split, self.src_lang, self.trg_lang,
                               self.src_lang))
        trg_split_path = os.path.join(
            self.data_dir,
            subset_path.format(split, self.src_lang, self.trg_lang,
                               self.trg_lang))

        src_dataset = SingleDataset(src_split_path)
        trg_dataset = SingleDataset(trg_split_path)
        pair_dataset = PairDataset(src_dataset, trg_dataset)
        self.datasets[split] = pair_dataset

    def build_model(self, args):
        encoder_embed_tokens = nn.Embedding(
            self.src_dict.token_num,
            args.encoder_embed_dim,
            padding_idx=self.src_dict.padding_idx)
        if args.share_all_embeddings:
            decoder_embed_tokens = encoder_embed_tokens
        else:
            decoder_embed_tokens = nn.Embedding(
                self.trg_dict.token_num,
                args.decoder_embed_dim,
                padding_idx=self.trg_dict.padding_idx)
        self.model = Transformer(args, self.src_dict, self.trg_dict)

    def build_criterion(self, label_smooth):
        self.criterion = LabelSmoothedCrossEntropyCriterion(label_smooth)

    def build_optimizer(self):
        if self.model is None:
            print("should build model first!")
        else:
            self.optimizer = CustomAdam(self.model.parameters(),
                                        lr=self.args.lr,
                                        betas=self.args.betas)
Example no. 17
    def instantiate_model(self,
                          english_vocab_size,
                          norwegian_vocab_size,
                          embedding_dim=256,
                          num_heads=8,
                          num_encoders=6,
                          ff_dim=256):
        model = Transformer(english_vocab_size, norwegian_vocab_size,
                            embedding_dim, num_heads, num_encoders, ff_dim,
                            self.cuda).to(self.cuda)

        for p in model.parameters():
            if p.dim() > 1:
                torch.nn.init.xavier_uniform_(p)  # in-place variant; xavier_uniform is deprecated

        return model
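
The p.dim() > 1 guard restricts Xavier initialization to weight matrices and leaves biases and other 1-D parameters at their defaults. The same effect can also be expressed with Module.apply, sketched below.

import torch.nn as nn

def init_weights(module):
    # apply() visits every submodule once; recurse=False avoids touching a parameter twice
    for p in module.parameters(recurse=False):
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

# model.apply(init_weights)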
Example no. 18
def model_testing(test_dataset, parameters):
    loc_to = '/home/preetham/Documents/Preetham/masters-thesis/results/gloss-to-grapheme/transformer/'
    global val_loss, val_accuracy, loss_object, transformer
    val_loss = tf.keras.metrics.Mean(name='val_loss')
    val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='val_accuracy')
    val_loss.reset_states()
    val_accuracy.reset_states()
    checkpoint_dir = loc_to + 'model_' + str(
        parameters['model']) + '/training_checkpoints'
    if parameters['n_layers'] <= 6:
        n_layers = parameters['n_layers']
    else:
        n_layers = parameters['n_layers'] - 6
    transformer = Transformer(n_layers,
                              parameters['d_model'],
                              parameters['n_heads'],
                              parameters['dff'],
                              parameters['inp_vocab_size'],
                              parameters['tar_vocab_size'],
                              pe_input=parameters['inp_vocab_size'],
                              pe_target=parameters['tar_vocab_size'],
                              rate=parameters['dropout'])
    checkpoint = tf.train.Checkpoint(transformer=transformer)
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
    for (batch,
         (inp, tar)) in enumerate(test_dataset.take(parameters['test_steps'])):
        val_step(inp, tar)
    print('Test Loss=', round(val_loss.result().numpy(), 3))
    print('Test Accuracy=', round(val_accuracy.result().numpy(), 3))
    print()
Example no. 19
def greedy_test(args):
    """ Test function """

    # load vocabulary
    vocab = torch.load(args.vocab)

    # build model
    translator = Transformer(args, vocab)
    translator.eval()

    # load parameters
    translator.load_state_dict(torch.load(args.decode_model_path))
    if args.cuda:
        translator = translator.cuda()

    test_data = read_corpus(args.decode_from_file, source="src")
    # e.g. ['<BOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
    pred_data = len(test_data) * [[
        constants.PAD_WORD if i else constants.BOS_WORD
        for i in range(args.decode_max_steps)
    ]]

    output_file = codecs.open(args.decode_output_file, "w", encoding="utf-8")
    for test, pred in zip(test_data, pred_data):
        pred_output = [constants.PAD_WORD] * args.decode_max_steps
        test_var = to_input_variable([test], vocab.src, cuda=args.cuda)

        # only need one time
        enc_output = translator.encode(test_var[0], test_var[1])
        for i in range(args.decode_max_steps):
            pred_var = to_input_variable([pred[:i + 1]],
                                         vocab.tgt,
                                         cuda=args.cuda)

            scores = translator.translate(enc_output, test_var[0], pred_var)

            _, argmax_idxs = torch.max(scores, dim=-1)
            one_step_idx = argmax_idxs[-1].item()

            pred_output[i] = vocab.tgt.id2word[one_step_idx]
            if (one_step_idx
                    == constants.EOS) or (i == args.decode_max_steps - 1):
                print("[Source] %s" % " ".join(test))
                print("[Predict] %s" % " ".join(pred_output[:i]))
                print()

                output_file.write(" ".join(pred_output[:i]) + "\n")
                output_file.flush()
                break
            pred[i + 1] = vocab.tgt.id2word[one_step_idx]

    output_file.close()
Example no. 20
def test(hp):
    # Loading hyper params
    load_hparams(hp, hp.ckpt)

    logging.info("# Prepare test batches")
    test_batches, num_test_batches, num_test_samples = get_batch(
        hp.test1,
        hp.test1,
        100000,
        100000,
        hp.vocab,
        hp.test_batch_size,
        shuffle=False)
    iter = tf.data.Iterator.from_structure(test_batches.output_types,
                                           test_batches.output_shapes)
    xs, ys = iter.get_next()

    test_init_op = iter.make_initializer(test_batches)

    logging.info("# Load model")
    model = Transformer(hp)

    logging.info("# Session")
    with tf.Session() as sess:
        ckpt_ = tf.train.latest_checkpoint(hp.ckpt)
        ckpt = ckpt_ if ckpt_ else hp.ckpt
        saver = tf.train.Saver()

        saver.restore(sess, ckpt)

        y_hat, mean_loss = model.eval(sess, test_init_op, xs, ys,
                                      num_test_batches)

        logging.info("# get hypotheses")
        hypotheses = get_hypotheses(num_test_samples, y_hat, model.idx2token)

        logging.info("# write results")
        model_output = os.path.split(ckpt)[-1]
        if not os.path.exists(hp.testdir):
            os.makedirs(hp.testdir)
        translation = os.path.join(hp.testdir, model_output)
        with open(translation, 'w', encoding="utf-8") as fout:
            fout.write("\n".join(hypotheses))

        logging.info("# calc bleu score and append it to translation")
        calc_bleu_nltk(hp.test2, translation)
Example no. 21
def my_model_fn(features, labels, mode, params):
    warmup_steps = min(params['warmup_steps'], params['train_steps'] * 0.1)
    config = params['config']
    x, y = features
    y_label = labels
    if FLAGS.model_type == 'transformer':
        transformer = Transformer(config=config, mode=mode)
    else:
        transformer = RNNTransformer(config=config, mode=mode)
    logits, predicts = transformer.create_model(x_input=x, y_input=y)
    loss = transformer.calculate_loss(logits=logits, y_labels=y_label)

    for v in tf.trainable_variables():
        tf.logging.info(v.name)

    if mode == tf.estimator.ModeKeys.TRAIN:
        '''
        Recommended approach when training the RNN model
        '''
        train_op, learning_rate = create_train_opt_with_clip(loss=loss,
                                                             step_num_in_epoch=params['train_steps'] / params[
                                                                 'num_epoches'])
        hook_dict = {
            'loss': loss,
            'learning_rate': learning_rate,
        }
        hook = tf.train.LoggingTensorHook(
            hook_dict,
            every_n_iter=10
        )
        return tf.estimator.EstimatorSpec(
            mode=mode,
            training_hooks=[hook],
            loss=loss,
            train_op=train_op)

    elif mode == tf.estimator.ModeKeys.PREDICT:

        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={'prediction': predicts}
        )

    else:

        raise NotImplementedError('not implemented')
Example no. 22
def padding_for_trs(batch):
    items = zip(*batch)
    padded_src, padded_trg, src_pos, trg_pos = list(
        map(lambda x: torch.nn.utils.rnn.pad_sequence(x, padding_value=C.PAD),
            items))
    trg_mask, src_key_padding_mask, trg_key_padding_mask, memory_key_padding_mask = Transformer.get_masks(
        padded_src, padded_trg[:-1], PAD=C.PAD)

    return padded_src, padded_trg, src_pos, trg_pos, trg_mask, src_key_padding_mask, trg_key_padding_mask, memory_key_padding_mask
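
padding_for_trs is shaped like a collate function; a hedged usage sketch follows, assuming each dataset item is a (src, trg, src_pos, trg_pos) tuple of 1-D LongTensors (train_dataset is a placeholder name).

from torch.utils.data import DataLoader

loader = DataLoader(train_dataset, batch_size=32, shuffle=True,
                    collate_fn=padding_for_trs)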
Example no. 23
    def __init__(self, model_dir, vocab_file):
        """
        :param model_dir: model dir path
        :param vocab_file: vocab file path
        """
        self.tf = import_tf(0)

        self.model_dir = model_dir
        self.vocab_file = vocab_file
        self.token2idx, self.idx2token = _load_vocab(vocab_file)

        hparams = Hparams()
        parser = hparams.parser
        self.hp = parser.parse_args()

        self.model = Transformer(self.hp)

        self._add_placeholder()
        self._init_graph()
Example no. 24
    def __init__(self, cfg):
        super(LightningTransformer, self).__init__()

        self.model_cfg = cfg.model
        self.data_cfg = cfg.data
        self.train_cfg = cfg.train_cfg
        self.lr_cfg = cfg.lr_cfg
        self._update_model_cfg_by_data()

        self.transformer = Transformer(**self.model_cfg)
Example no. 25
def create_src_masks(src, SRC_SEQ_LEN, TEXT, use_srcmask=False):
    if use_srcmask:
        src_mask = Transformer.generate_square_subsequent_mask(SRC_SEQ_LEN).to(
            device)
    else:
        src_mask = None
    src_key_padding_mask = (src == TEXT.vocab.stoi['<pad>']).bool().to(device)
    memory_key_padding_mask = (
        src == TEXT.vocab.stoi['<pad>']).bool().to(device)
    return src_mask, src_key_padding_mask, memory_key_padding_mask
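
generate_square_subsequent_mask follows the torch.nn.Transformer convention: an additive float mask with -inf above the diagonal, so position i cannot attend to later positions. A quick check with the PyTorch built-in (exposed as a static method in recent releases):

import torch

print(torch.nn.Transformer.generate_square_subsequent_mask(4))
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])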
Example no. 26
def load_model(checkpoint, device):
    model_args = checkpoint["settings"]

    model = Transformer(
        model_args["embedding_size"],
        model_args["src_vocab_size"],
        model_args["tgt_vocab_size"],
        model_args["src_pad_idx"],
        model_args["num_heads"],
        model_args["num_encoder_layers"],
        model_args["num_decoder_layers"],
        model_args["forward_expansion"],
        model_args["dropout"],
        model_args["max_len"],
        model_args["device"],
    ).to(device)

    model.load_state_dict(checkpoint["state_dict"])
    print("[Info] Trained model state loaded.")
    return model
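
load_model expects a checkpoint dict with "settings" and "state_dict" keys; the matching save side would look roughly like the sketch below (the hyper-parameter values and the model/vocabulary variables are placeholders).

import torch

checkpoint = {
    "settings": {                      # everything load_model reads back
        "embedding_size": 512,
        "src_vocab_size": src_vocab_size,
        "tgt_vocab_size": tgt_vocab_size,
        "src_pad_idx": src_pad_idx,
        "num_heads": 8,
        "num_encoder_layers": 6,
        "num_decoder_layers": 6,
        "forward_expansion": 4,
        "dropout": 0.1,
        "max_len": 100,
        "device": device,
    },
    "state_dict": model.state_dict(),
}
torch.save(checkpoint, "checkpoint.pt")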
Example no. 27
def create_model():
    transformer = Transformer(
        opt.num_layers,
        opt.d_model,
        opt.num_heads,
        opt.dff,
        encoder_vocab_size,
        decoder_vocab_size,
        pe_input=encoder_vocab_size,
        pe_target=decoder_vocab_size,
    )
    return transformer
Example no. 28
def main():
    parser = argparse.ArgumentParser(description="Train the model")
    parser.add_argument('-data', required=True)
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # Load data
    data = torch.load(opt.data)

    opt.max_token_seq_len = data['settings'].max_word_seq_len + 2

    training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size

    print(opt)
    # opt.cuda = True
    device = torch.device('cuda' if opt.cuda else 'cpu')

    # TODO: Fill the code
    transformer = Transformer(d_word_embedding=opt.d_word_vec,
                              d_h=opt.d_model,
                              d_s=opt.d_model,
                              src_vocab_size=opt.src_vocab_size,
                              tgt_vocab_size=opt.tgt_vocab_size,
                              max_sent_len=opt.max_token_seq_len).to(device)

    optimizer = optim.Adam(filter(lambda x: x.requires_grad,
                                  transformer.parameters()),
                           betas=(0.9, 0.98),
                           eps=1e-09)

    train(transformer, training_data, validation_data, optimizer, device, opt)
Example no. 29
def main(gpu_id=None):
    dataset = Dataset(transform=transform, n_datas=10000)
    pad_vec = np.zeros(len(dataset.human_vocab))
    pad_vec[dataset.human_vocab['<pad>']] = 1
    dataloader = torch.utils.data.DataLoader(dataset=dataset,
                                             batch_size=6,
                                             shuffle=True,
                                             num_workers=6,
                                             collate_fn=partial(
                                                 collate_fn, pad_vec))

    model = Transformer(n_head=2)
    if gpu_id is not None:
        print('use gpu')
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu_id
        n_gpus = torch.cuda.device_count()
        # print('use %d gpu [%s]' % (n_gpus, gpu_id))
        model = model.cuda()
        # model = torch.nn.DataParallel(model, device_ids=[i for i in range(n_gpus)])
    # loss_fn = torch.nn.CrossEntropyLoss()
    loss_fn = torch.nn.MSELoss()

    optimizer = torch.optim.Adam(model.parameters())

    model = sl.load_model('./checkpoint', -1, model)
    optimizer = sl.load_optimizer('./checkpoint', -1, optimizer)

    try:
        trained_epoch = sl.find_last_checkpoint('./checkpoint')
        print('train from epoch %d' % (trained_epoch + 1))
    except Exception as e:
        print('train from the very beginning, {}'.format(e))
        trained_epoch = -1
    for epoch in range(trained_epoch + 1, 20):
        train(model,
              loss_fn,
              optimizer,
              dataloader,
              epoch,
              use_gpu=True if gpu_id is not None else False)
Example no. 30
def init_training(args):
    """ Initialize training process """

    # load vocabulary
    vocab = torch.load(args.vocab)

    # build model
    transformer = Transformer(args, vocab)

    # if finetune
    if args.finetune:
        print("[Finetune] %s" % args.finetune_model_path)
        transformer.load_state_dict(torch.load(args.finetune_model_path))

    # vocab_mask for masking padding
    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt[constants.PAD_WORD]] = 0

    # loss object
    cross_entropy_loss = nn.CrossEntropyLoss(weight=vocab_mask,
                                             reduction='sum')  # size_average=False is deprecated

    if args.cuda:
        transformer = transformer.cuda()
        cross_entropy_loss = cross_entropy_loss.cuda()

    if args.optimizer == "Warmup_Adam":
        optimizer = ScheduledOptim(
            torch.optim.Adam(transformer.get_trainable_parameters(),
                             betas=(0.9, 0.98),
                             eps=1e-09), args.d_model, args.n_warmup_steps)

    if args.optimizer == "Adam":
        optimizer = torch.optim.Adam(
            params=transformer.get_trainable_parameters(),
            lr=args.lr,
            betas=(0.9, 0.98),
            eps=1e-8)

    if args.optimizer == 'SGD':
        optimizer = torch.optim.SGD(
            params=transformer.get_trainable_parameters(), lr=args.lr)

    # multi gpus
    if torch.cuda.device_count() > 1:
        print("[Multi GPU] using", torch.cuda.device_count(), "GPUs\n")
        transformer = nn.DataParallel(transformer)

    return vocab, transformer, optimizer, cross_entropy_loss