Example #1
def evaluate():
    # Load model
    weight_path = 'model/09031344_epoch_4_train_loss_3.7933.h5'

    # Load data
    X, Sources, Targets = load_test_data()
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()

    model = TransformerModel(in_vocab_len=len(idx2de),
                             out_vocab_len=len(idx2en),
                             max_len=hp.maxlen)
    model.load_model(weight_path)

    for i in range(len(X) // hp.batch_size):
        x = X[i * hp.batch_size:(i + 1) * hp.batch_size]
        sources = Sources[i * hp.batch_size:(i + 1) * hp.batch_size]
        targets = Targets[i * hp.batch_size:(i + 1) * hp.batch_size]

        preds = model.translate(x, idx2en)

        for source, target, pred in zip(sources, targets, preds):
            print('source:', source)
            print('expected:', target)
            print('pred:', pred)
            print()
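All of the examples on this page call load_de_vocab() and load_en_vocab() without showing them. The sketch below only illustrates the contract these snippets assume: a pair of dictionaries mapping token to id and id to token, read from a one-token-per-line vocabulary file. The path handling, special tokens, and any frequency cutoff here are assumptions, not the projects' actual loaders.

import codecs

def load_vocab_sketch(vocab_path):
    # One token per line; by convention the lowest ids are special symbols
    # such as <PAD>, <UNK>, <S>, </S>.
    tokens = [line.split()[0] for line in codecs.open(vocab_path, 'r', 'utf-8') if line.strip()]
    token2idx = {token: idx for idx, token in enumerate(tokens)}
    idx2token = {idx: token for idx, token in enumerate(tokens)}
    return token2idx, idx2token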
Example #2
def eval(): 
    # Load graph
    g = Graph(is_training=False)
    print("Graph loaded")
    
    # Load data
    X, Sources, Targets = load_test_data()
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
     
#     X, Sources, Targets = X[:33], Sources[:33], Targets[:33]
     
    # Start session         
    with g.graph.as_default():    
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            ## Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print("Restored!")
              
            ## Get model name
            mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name
             
            ## Inference
            if not os.path.exists('results'): 
                os.mkdir('results')
            with codecs.open("results/" + mname, "w", "utf-8") as fout:
                list_of_refs, hypotheses = [], []
                for i in range(len(X) // hp.batch_size):
                     
                    ### Get mini-batches
                    x = X[i*hp.batch_size: (i+1)*hp.batch_size]
                    sources = Sources[i*hp.batch_size: (i+1)*hp.batch_size]
                    targets = Targets[i*hp.batch_size: (i+1)*hp.batch_size]
                     
                    ### Autoregressive inference
                    preds = np.zeros((hp.batch_size, hp.maxlen), np.int32)  # (32, 10)
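                    # Greedy autoregressive decoding: start from an all-zero target and
                    # fill one position per pass; with the causal mask, column j of pass j
                    # depends only on the columns already filled in earlier passes.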
                    for j in range(hp.maxlen):
                        _preds = sess.run(g.preds, {g.x: x, g.y: preds})
                        preds[:, j] = _preds[:, j]
                     
                    ### Write to file
                    for source, target, pred in zip(sources, targets, preds): # sentence-wise
                        #print(got)
                        got = " ".join(idx2en[idx] for idx in pred).split("</S>")[0].strip()
                        fout.write("- source: " + source +"\n")
                        fout.write("- expected: " + target + "\n")
                        fout.write("- got: " + got + "\n\n")
                        fout.flush()
                          
                        # bleu score
                        ref = target.split()
                        hypothesis = got.split()
                        if len(ref) > 3 and len(hypothesis) > 3:
                            list_of_refs.append([ref])
                            hypotheses.append(hypothesis)
              
                ## Calculate bleu score
                score = corpus_bleu(list_of_refs, hypotheses)
                fout.write("Bleu Score = " + str(100*score))
Example #3
def evaluate_train():
    # Load model
    weight_path = 'model/09031925_epoch_0_train_loss_5.9855.h5'

    # Load data
    Sources, Targets = load_train_data()
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
    batch_size = 5

    model = TransformerModel(in_vocab_len=len(idx2de),
                             out_vocab_len=len(idx2en),
                             max_len=hp.maxlen)
    model.load_model(weight_path)

    for i in range(5 // batch_size):
        x = Sources[i * batch_size:(i + 1) * batch_size]
        sources = Sources[i * batch_size:(i + 1) * batch_size]
        targets = Targets[i * batch_size:(i + 1) * batch_size]

        preds = model.translate_with_ans(sources, targets, idx2en)
        # preds = model.translate(x, idx2en)

        for source, target, pred in zip(sources, targets, preds):
            print('source:', ' '.join(idx2de[idx] for idx in source))
            print('expected:', ' '.join(idx2en[idx] for idx in target))
            print('pred:', pred)
            print()
Example #4
def syn_train_api():

    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()

    # Construct graph
    g = Graph("train")
    print("Graph loaded")

    with g.graph.as_default():
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            ## Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(model_path))
            print("Restored!")
            # Start session

            if sv.should_stop():
                return
            for step in tqdm(range(g.num_batch),
                             total=g.num_batch,
                             ncols=70,
                             leave=False,
                             unit='b'):
                sess.run(g.train_op)

            loss = sess.run(g.mean_loss)
            print("============loss=========: %f" % loss)
            gs = sess.run(g.global_step)
            sv.saver.save(sess, tf.train.latest_checkpoint(model_path))
            print(sess.run(g.acc))
    print("Done")
Example #5
def test(config):
    _config_test(config)

    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
    
    model = ConvSeq2Seq(config)
    graph_handler = GraphHandler(config)
    inferencer = Inferencer(config, model)
    sess = tf.Session()
    graph_handler.initialize(sess)

    global_step = 0
    refs = []
    hypotheses = []
    with codecs.open(os.path.join(config.eval_dir, config.model_name), "w", "utf-8") as fout:
        for i, batch in tqdm(enumerate(get_batch_for_test())):
            preds = inferencer.run(sess, batch)
            sources = batch['source']
            targets = batch['target']
            for source, target, pred in zip(sources, targets, preds):
                got = " ".join(idx2en[idx] for idx in pred).split("</S>")[0].strip()
                fout.write("- source: " + source +"\n")
                fout.write("- expected: " + target + "\n")
                fout.write("- got: " + got + "\n\n")
                fout.flush()

                ref = target.split()
                hypothesis = got.split()
                if len(ref) > 3 and len(hypothesis) > 3:
                    refs.append([ref])
                    hypotheses.append(hypothesis)

        score = corpus_bleu(refs, hypotheses)
        fout.write("Bleu Score = " + str(100*score))
Example #6
def train():
    current_batches = 0
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
    enc_voc = len(de2idx)
    dec_voc = len(en2idx)
    writer = SummaryWriter()
    # Load data
    X, Y = load_train_data()
    # calc total batch count
    num_batch = len(X) // hp.batch_size
    model = AttModel(hp, enc_voc, dec_voc)
    model.train()
    model.cuda()
    torch.backends.cudnn.benchmark = True
    if not os.path.exists(hp.model_dir):
        os.makedirs(hp.model_dir)
    if hp.preload is not None and os.path.exists(hp.model_dir + '/history.pkl'):
        with open(hp.model_dir + '/history.pkl', 'rb') as in_file:
            history = pickle.load(in_file)
    else:
        history = {'current_batches': 0}
    current_batches = history['current_batches']
    optimizer = optim.Adam(model.parameters(), lr=hp.lr, betas=[0.9, 0.98], eps=1e-8)
    if hp.preload is not None and os.path.exists(hp.model_dir + '/optimizer.pth'):
        optimizer.load_state_dict(torch.load(hp.model_dir + '/optimizer.pth'))
    if hp.preload is not None and os.path.exists(hp.model_dir + '/model_epoch_%02d.pth' % hp.preload):
        model.load_state_dict(torch.load(hp.model_dir + '/model_epoch_%02d.pth' % hp.preload))

    startepoch = int(hp.preload) if hp.preload is not None else 1
    for epoch in range(startepoch, hp.num_epochs + 1):
        current_batch = 0
        for index, current_index in get_batch_indices(len(X), hp.batch_size):
            tic = time.time()
            x_batch = Variable(torch.LongTensor(X[index]).cuda())
            y_batch = Variable(torch.LongTensor(Y[index]).cuda())
            toc = time.time()
            tic_r = time.time()
            torch.cuda.synchronize()
            optimizer.zero_grad()
            loss, _, acc = model(x_batch, y_batch)
            loss.backward()
            optimizer.step()
            torch.cuda.synchronize()
            toc_r = time.time()
            current_batches += 1
            current_batch += 1
            if current_batches % 10 == 0:
                writer.add_scalar('./loss', loss.data.cpu().numpy(), current_batches)
                writer.add_scalar('./acc', acc.data.cpu().numpy(), current_batches)
            if current_batches % 5 == 0:
                print('epoch %d, batch %d/%d, loss %f, acc %f' % (epoch, current_batch, num_batch, loss.data[0], acc.data[0]))
                print('batch loading used time %f, model forward used time %f' % (toc - tic, toc_r - tic_r))
            if current_batches % 100 == 0:
                writer.export_scalars_to_json(hp.model_dir + '/all_scalars.json')
        # Persist the running batch counter so a preloaded run resumes correctly.
        history['current_batches'] = current_batches
        with open(hp.model_dir + '/history.pkl', 'wb') as out_file:
            pickle.dump(history, out_file)
        checkpoint_path = hp.model_dir + '/model_epoch_%02d' % epoch + '.pth'
        torch.save(model.state_dict(), checkpoint_path)
        torch.save(optimizer.state_dict(), hp.model_dir + '/optimizer.pth')
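The training loop above relies on a get_batch_indices helper that is not shown. The sketch below is only a plausible reading inferred from how the helper is used (it has to yield an index array for X together with a running batch counter); the real helper may shuffle or handle the final partial batch differently.

import numpy as np

def get_batch_indices_sketch(total_length, batch_size):
    # Shuffle all example indices once, then yield (indices_for_this_batch, batch_number).
    order = np.random.permutation(total_length)
    for batch_number, start in enumerate(range(0, total_length - batch_size + 1, batch_size)):
        yield order[start:start + batch_size], batch_number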
Example #7
def eval():
    # Load data
    X, Sources, Targets = load_test_data()
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
    enc_voc = len(de2idx)
    dec_voc = len(en2idx)

    # load model
    model = AttModel(hp, enc_voc, dec_voc)
    model.load_state_dict(
        torch.load(hp.model_dir + '/model_epoch_%02d' % hp.eval_epoch +
                   '.pth'))
    print('Model Loaded.')
    model.eval()
    model.cuda()
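    # Note: model.eval() only switches layers like dropout to inference mode;
    # autograd is still active, so wrapping the loop below in torch.no_grad()
    # would avoid building computation graphs during evaluation.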
    # Inference
    if not os.path.exists('results'):
        os.mkdir('results')
    with codecs.open('results/model%d.txt' % hp.eval_epoch, 'w',
                     'utf-8') as fout:
        list_of_refs, hypotheses = [], []
        for i in range(len(X) // hp.batch_size):
            # Get mini-batches
            x = X[i * hp.batch_size:(i + 1) * hp.batch_size]
            sources = Sources[i * hp.batch_size:(i + 1) * hp.batch_size]
            targets = Targets[i * hp.batch_size:(i + 1) * hp.batch_size]

            # Autoregressive inference
            x_ = torch.LongTensor(x).cuda()
            preds_t = torch.LongTensor(
                np.zeros((hp.batch_size, hp.maxlen), np.int32)).cuda()
            preds = preds_t
            for j in range(hp.maxlen):

                _, _preds, _ = model(x_, preds)
                preds_t[:, j] = _preds.data[:, j]
                preds = preds_t.long()
            preds = preds.data.cpu().numpy()

            # Write to file
            for source, target, pred in zip(sources, targets,
                                            preds):  # sentence-wise
                got = " ".join(idx2en[idx]
                               for idx in pred).split("</S>")[0].strip()
                fout.write("- source: " + source + "\n")
                fout.write("- expected: " + target + "\n")
                fout.write("- got: " + got + "\n\n")
                fout.flush()

                # bleu score
                ref = target.split()
                hypothesis = got.split()
                if len(ref) > 3 and len(hypothesis) > 3:
                    list_of_refs.append([ref])
                    hypotheses.append(hypothesis)
        # Calculate bleu score over the whole test set (after all batches)
        score = corpus_bleu(list_of_refs, hypotheses)
        fout.write("Bleu Score = " + str(100 * score))
Example #8
    def __init__(self, transformerModel, output_dir):
        self.transformerModel = transformerModel
        self.output_dir = output_dir
        self.Sources, self.Targets = load_train_data()
        _, self.idx2de = load_de_vocab()
        _, self.idx2en = load_en_vocab()

        os.makedirs(self.output_dir, exist_ok=True)
Example #9
def eval(): 
    # Load graph
    g = Graph(is_training=False)
    print("Graph loaded")
    
    # Load data
    X, Sources, Targets = load_test_data()
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
     
#     X, Sources, Targets = X[:33], Sources[:33], Targets[:33]
     
    # Start session         
    with g.graph.as_default():    
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            ## Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print("Restored!")
              
            ## Get model name
            mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name
             
            ## Inference
            if not os.path.exists('results'): os.mkdir('results')
            with codecs.open("results/" + mname, "w", "utf-8") as fout:
                list_of_refs, hypotheses = [], []
                for i in range(len(X) // hp.batch_size):
                     
                    ### Get mini-batches
                    x = X[i*hp.batch_size: (i+1)*hp.batch_size]
                    sources = Sources[i*hp.batch_size: (i+1)*hp.batch_size]
                    targets = Targets[i*hp.batch_size: (i+1)*hp.batch_size]
                     
                    ### Autoregressive inference
                    preds = np.zeros((hp.batch_size, hp.maxlen), np.int32)
                    for j in range(hp.maxlen):
                        _preds = sess.run(g.preds, {g.x: x, g.y: preds})
                        preds[:, j] = _preds[:, j]
                     
                    ### Write to file
                    for source, target, pred in zip(sources, targets, preds): # sentence-wise
                        got = " ".join(idx2en[idx] for idx in pred).split("</S>")[0].strip()
                        fout.write("- source: " + source +"\n")
                        fout.write("- expected: " + target + "\n")
                        fout.write("- got: " + got + "\n\n")
                        fout.flush()
                          
                        # bleu score
                        ref = target.split()
                        hypothesis = got.split()
                        if len(ref) > 3 and len(hypothesis) > 3:
                            list_of_refs.append([ref])
                            hypotheses.append(hypothesis)
              
                ## Calculate bleu score
                score = corpus_bleu(list_of_refs, hypotheses)
                fout.write("Bleu Score = " + str(100*score))
Example #10
def eval2(): 
    # Load graph
    g = Graph(is_training=False)
    print("Graph loaded")
    
    # Load data
    X, Sources, Targets = load_test_data1()
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
     
     
    # Start session
    with g.graph.as_default():    
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            ## Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print("Restored!")
              
            ## Get model name
            mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name
             
            ## Inference
            if not os.path.exists('results'): os.mkdir('results')
            with codecs.open("results/eval2", "w", "utf-8") as fout:
                list_of_refs, hypotheses = [], []
                for i in range(len(X) // hp.batch_size):
                     
                    ### Get mini-batches
                    x = X[i*hp.batch_size: (i+1)*hp.batch_size]
                    sources = Sources[i*hp.batch_size: (i+1)*hp.batch_size]
                    targets = Targets[i*hp.batch_size: (i+1)*hp.batch_size]
                     
                    ### Autoregressive inference
                    preds = np.zeros((hp.batch_size, hp.maxlen), np.int32)
                    for j in range(hp.maxlen):
                        _preds = sess.run(g.preds, {g.x: x, g.y: preds})
                        preds[:, j] = _preds[:, j]
                     
                    ### Write to file
                    for source, target, pred in zip(sources, targets, preds): # sentence-wise
                        got = " ".join(idx2en[idx] for idx in pred).split("</S>")[0].strip()
                        fout.write("- source: " + source +"\n")
                        #fout.write("- expected: " + target + "\n")
                        fout.write("- got: " + got + "\n\n")
                        print("- source: " + source +"\n")
                        #print("- expected: " + target + "\n")
                        print("- got: " + got + "\n\n")
                        fout.flush()
                          
                        # bleu score
                        ref = target.split()
                        hypothesis = got.split()
                        if len(ref) > 3 and len(hypothesis) > 3:
                            list_of_refs.append([ref])
                            hypotheses.append(hypothesis)
Example #11
def train():
    # Load graph
    g = Graph(is_training=True)
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
    print("Graph loaded")
    # Load data
    X, Image_index, _, X_target = load_cap_data(set="en")
    images = np.load(image_path.format("train"))
    # x_val,Image_index_val,_,Targets = load_test_cap_data(set="test",language="en")
    # val_images = np.load("../image/task1_ResNet50_res4fx_test2017.fp16.npy")
    # num_batch_val = len(x_val)//hp.batch_size
    # smoothie = SmoothingFunction().method2
    # Start session
    num_batch = int(math.ceil(len(X) / hp.batch_size))
    if not os.path.exists(hp.logdir_cap_en): os.mkdir(hp.logdir_cap_en)
    with g.graph.as_default():
        saver = tf.train.Saver(var_list=g.value_list, max_to_keep=40)
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            sess.run(tf.global_variables_initializer())
            # saver.restore(sess, tf.train.latest_checkpoint("logdir_en2"))
            # print("Restored!")
            ## train
            for epoch in range(hp.num_epochs):
                for i in range(num_batch):
                    lr = hp.lr_cap * pow(0.95, epoch)
                    step = epoch * num_batch + i
                    ### Get mini-batches
                    image = images[Image_index[i * hp.batch_size:(i + 1) *
                                               hp.batch_size]]
                    x = X[i * hp.batch_size:(i + 1) * hp.batch_size]
                    x_target = X_target[i * hp.batch_size:(i + 1) *
                                        hp.batch_size]
                    feed_dict = {
                        g.x: x,
                        g.image: image,
                        g.dropout_rate: hp.dropout_rate,
                        g.lstm_drop_rate: hp.lstm_drop_rate,
                        g.lr: lr,
                        g.x_target: x_target
                    }
                    if i % 1000 == 0:
                        _, loss, preds = sess.run(
                            [g.train_op, g.loss, g.preds_list], feed_dict)
                        with open("en.txt", "a+") as f:
                            f.write("loss {}".format(i) + " " + str(loss))
                    else:
                        sess.run(g.train_op, feed_dict)
                    if (step + 1) % 1000 == 0:
                        saver.save(sess,
                                   save_path=hp.logdir_cap_en +
                                   '/model_step_%d' % (step))
Example #12
    def __init__(self, is_training):
        self.de2idx, _idx2de = load_de_vocab()
        self.en2idx, _idx2en = load_en_vocab()
        self.is_training = is_training
        self.graph = tf.Graph()
        with self.graph.as_default():
            if self.is_training:
                self.x, self.y, self.num_batch = get_batch_data() # (N, T)
            else: # inference
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            self.x_len = tf.reduce_sum(self.x, axis=-1)
            self.y_len = tf.reduce_sum(self.y, axis=-1)
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.batch_size = tf.shape(self.x)[0]
Example #13
def eval():
    g = train_Graph(is_training=False)
    print('Graph loaded')
    X, Sources, Targets = load_test_data()
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
    with g.graph.as_default():
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print('Restored')
            mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1]  # model name

            if not os.path.exists('results'): os.mkdir('results')
            with codecs.open('results/' + mname, 'w', 'utf-8') as fout:
                list_of_refs, hypotheses = [], []
                for i in range(len(X) // hp.batch_size):
                    x = X[i*hp.batch_size:(i+1)*hp.batch_size]
                    sources = Sources[i*hp.batch_size:(i+1)*hp.batch_size]
                    targets = Targets[i*hp.batch_size:(i+1)*hp.batch_size]

                    preds = np.zeros((hp.batch_size, hp.max_seq_len), np.int32)
                    for j in range(hp.max_seq_len):
                        '''Predict one word per pass so that each later word can be decoded
                        using the words produced so far. The loop therefore runs hp.max_seq_len
                        times; each pass feeds the partial translation back into the decoder
                        and fixes one more word.'''
                        _preds = sess.run(g.preds, {g.x:x, g.y:preds})
                        preds[:, j] = _preds[:, j]

                    for source, target, pred in zip(sources, targets, preds):
                        got = ''.join(idx2en[idx] for idx in pred).split('</S>')[0].strip()
                        fout.write('-source:' + source + '\n')
                        fout.write('-expected:' + target + '\n')
                        fout.write('-got:' + got + '\n\n')
                        fout.flush()

                        # bleu score
                        ref = target.split()
                        hypothesis = got.split()
                        if len(ref) > 3 and len(hypothesis) > 3:
                            list_of_refs.append([ref])
                            hypotheses.append(hypothesis)
                # Calculate bleu score over the whole test set
                score = corpus_bleu(list_of_refs, hypotheses)
                fout.write('Bleu Score = ' + str(100*score))
Example #14
def create_data(input_sent):
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
    x = []
    y = []
    if len(input_sent) < (hp.maxlen - 1):

        x.append(de2idx.get("<S>", 1))
        for each in input_sent:
            x.append(de2idx.get(each, 1))
        x.append(de2idx.get("</S>", 1))
        y.append(np.array(x))
        y = np.array(y)
    print(y.shape)
    Input = []
    Input.append(input_sent)

    X = np.zeros([len(y), hp.maxlen], np.int32)
    print(X.shape)
    X[0] = np.lib.pad(y[0], [0, hp.maxlen - len(y[0])],
                      'constant',
                      constant_values=0)
    print(X.shape)
    return X, Input
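A hypothetical call, only to show the shapes create_data returns; whether input_sent is a list of words or of characters is an assumption here:

X, Input = create_data("ein kleines Beispiel".split())
# X: int32 array of shape (1, hp.maxlen), zero-padded on the right;
# Input: a single-element list holding the original token sequence.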
Example #15
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data(
                )  # shape=[batch_size, max_seq_len]
            else:
                self.x = tf.placeholder(tf.int32, shape=(None, hp.max_seq_len))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.max_seq_len))
            # decoder_inputs
            '''Compared with self.y, decoder_inputs drops the final end-of-sentence token and prepends the id 2, i.e. <S>, which marks the start of decoding.'''
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), axis=-1)
            # load_vocab
            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()

            # encoder
            with tf.variable_scope('encoder'):
                # input - word embedding
                self.enc = embedding(self.x,
                                     vocab_size=len(de2idx),
                                     d_model=hp.d_model,
                                     scale=True,
                                     scope='enc_embed')
                # input - positional encoding
                self.enc += embedding(tf.tile(
                    tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                    [tf.shape(self.x)[0], 1]),
                                      vocab_size=hp.max_seq_len,
                                      d_model=hp.d_model,
                                      zero_pad=False,
                                      scale=False,
                                      scope='enc_pe')
                # Dropout
                self.enc = tf.layers.dropout(
                    self.enc,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))
                # 3. num_layers multi-head attention
                for i in range(hp.num_layers):
                    with tf.variable_scope('num_layers_{}'.format(i)):
                        # multi head attention + Add and Norm
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            d_model=hp.d_model,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False)
                        # feed forward + Add and Norm
                        self.enc = feedforward(
                            self.enc, dff=[4 * hp.d_model, hp.d_model])

            # decoder
            with tf.variable_scope('decoder'):
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(en2idx),
                                     d_model=hp.d_model,
                                     scale=True,
                                     scope='dec_embed')
                self.dec += embedding(tf.tile(
                    tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]),
                                   0), [tf.shape(self.decoder_inputs)[0], 1]),
                                      vocab_size=hp.max_seq_len,
                                      d_model=hp.d_model,
                                      zero_pad=False,
                                      scale=False,
                                      scope='dec_pe')
                self.dec = tf.layers.dropout(
                    self.dec,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))
                for i in range(hp.num_layers):
                    with tf.variable_scope('num_layers_{}'.format(i)):
                        # masked multi-head attention
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.dec,
                            d_model=hp.d_model,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=True,
                            scope='self-attention')
                        # multi-head attention
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.enc,
                            d_model=hp.d_model,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False,
                            scope='vanilla-attention')
                        self.dec = feedforward(
                            self.dec,
                            dff=[4 * hp.d_model, hp.d_model
                                 ])  # shape=[batch_size, seq_len, d_model]

            # final linear projection
            self.logits = tf.layers.dense(
                self.dec,
                len(en2idx))  # shape=[batch_size, seq_len, target_vocab_size]
            self.preds = tf.to_int32(tf.arg_max(
                self.logits, dimension=-1))  # predictions, shape=[batch_size, seq_len]
            self.istarget = tf.to_float(tf.not_equal(
                self.y, 0))  # mask of non-padding targets, shape=[batch_size, seq_len]
            # padding positions are excluded from the accuracy
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) *
                self.istarget) / tf.reduce_sum(self.istarget)
            tf.summary.scalar('acc', self.acc)

            if is_training:
                # loss
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                # padding positions are excluded from the loss
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / (tf.reduce_sum(self.istarget))
                # training scheme
                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)
                # summary
                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
Example #16
    def __init__(self, is_training=True):
        self.graph = tf.Graph()

        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()
            else:
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            # define decoder inputs
            # id 2 stands for <S> and is the decoder's initial input. This step shifts the normal y right by one:
            # e.g. y = [["i", "love", "china", "deeply"], ["can", "you", "speak", "chinese"]] becomes
            # [["<s>", "i", "love", "china"], ["<s>", "can", "you", "speak"]], which is what the decoder's
            # self-attention sees first.
            # During training decoder_inputs is built as above; at inference time the true y is unknown, so y is
            # fed as an all-zero tensor of shape [batch_size, max_length]. After the shift it looks like
            # [["<s>", 0, 0, 0]]; each pass keeps the first predicted token, feeds it back in, then the first two,
            # and so on.
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)
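            # Illustration with made-up ids (not taken from the real vocab):
            #   y              = [[4, 9, 7, 3, 0]]   # "i love china </S> <pad>"
            #   decoder_inputs = [[2, 4, 9, 7, 3]]   # "<S> i love china </S>"
            # i.e. the targets shifted right by one position with id 2 (<S>) prepended.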

            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()

            with tf.variable_scope("encoder"):
                # Embedding
                self.enc = embedding(
                    self.x,
                    vocab_size=len(de2idx),
                    num_units=hp.hidden_units,
                    zero_pad=True,  # row 0 is the padding embedding; True zeroes that row (random init would not give exactly zero)
                    scale=True,
                    scope="enc_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope='enc_pe')

                else:
                    self.enc += embedding(tf.tile(
                        tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                        [tf.shape(self.x)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="enc_pe")

                ## Dropout
                self.enc = tf.layers.dropout(
                    self.enc,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks: stack hp.num_blocks encoder blocks (6 in the paper)
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### MultiHead Attention
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False)
                        self.enc = feedforward(
                            self.enc,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            with tf.variable_scope("decoder"):
                # Embedding
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(en2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="dec_embed")

                # Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(
                        tf.expand_dims(
                            tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                        [tf.shape(self.decoder_inputs)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="dec_pe")

                # Dropout
                self.dec = tf.layers.dropout(
                    self.dec,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                # Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.dec,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=True,
                            scope="self_attention")

                        ## Multihead Attention ( vanilla attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False,
                            scope="vanilla_attention")

                        ## Feed Forward
                        self.dec = feedforward(
                            self.dec,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            # Final linear projection: a classification over the target vocabulary, so the number of classes equals the vocabulary size
            self.logits = tf.layers.dense(self.dec, len(en2idx))
            self.preds = tf.to_int32(tf.argmax(self.logits, dimension=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) * self.istarget /
                (tf.reduce_sum(self.istarget)))

            if is_training:
                # Loss
                # Label smoothing turns the 0s of the one-hot targets into a small value and the 1s into a value slightly below 1.
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(en2idx)))
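                # label_smoothing presumably maps each one-hot row to
                # (1 - epsilon) * one_hot + epsilon / V with V = len(en2idx),
                # as in the Transformer paper; the exact epsilon lives in that helper.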
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
Example #17
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction='none')


def loss_fun(y_true, y_pred):
    mask = tf.math.logical_not(tf.math.equal(y_true, 0))
    loss_ = loss_object(y_true, y_pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)
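# The mask above zeroes the loss at padded positions (y_true == 0). Because
# reduce_mean still averages over every position, padding dilutes the mean;
# a strictly per-token average would be tf.reduce_sum(loss_) / tf.reduce_sum(mask).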


# Metrics for tracking the training loss and accuracy
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_acc = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

de2index, index2de = load_de_vocab()
en2index, index2en = load_en_vocab()
input_vocab_size = len(de2index)
target_vocab_size = len(en2index)

transformer = Transformer(hp.d_model, hp.num_layers, hp.num_heads, hp.dff,
                          input_vocab_size, target_vocab_size, hp.max_seq_len,
                          hp.dropout_rate)

# Create the checkpoint manager
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, hp.ckpt_path, max_to_keep=3)
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored')
Example #18
def eval():
    # Load graph
    g = Graph(is_training=False)
    print("Graph loaded")

    # Load data
    X, Sources, Targets = load_test_data()
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()

    #     X, Sources, Targets = X[:33], Sources[:33], Targets[:33]

    # Start session
    with g.graph.as_default():
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            ## Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print("Restored!")

            ## Get model name
            mname = open(hp.logdir + '/checkpoint',
                         'r').read().split('"')[1]  # model name

            ## Inference
            if not os.path.exists('results'): os.mkdir('results')
            with codecs.open("results/" + mname, "w", "utf-8") as fout:
                print("open reults success\n")
                list_of_refs, hypotheses = [], []
                print("length of batch is" + str(len(X) // hp.batch_size))
                for i in range(len(X) // hp.batch_size):
                    print('translating')
                    ### Get mini-batches
                    x = X[i * hp.batch_size:(i + 1) * hp.batch_size]
                    sources = Sources[i * hp.batch_size:(i + 1) *
                                      hp.batch_size]
                    targets = Targets[i * hp.batch_size:(i + 1) *
                                      hp.batch_size]

                    ### Autoregressive inference
                    # After this loop, the translations for the whole batch are stored in preds
                    preds = np.zeros((hp.batch_size, hp.maxlen), np.int32)
                    for j in range(hp.maxlen):
                        _preds = sess.run(g.preds, {g.x: x, g.y: preds})
                        preds[:, j] = _preds[:, j]

                    ### Write to file
                    # Once translation finishes, the sentences held in preds are written out
                    for source, target, pred in zip(sources, targets,
                                                    preds):  # sentence-wise
                        # Convert each id in pred (one sentence from preds) to its English word, join the words with spaces, and cut everything from the end-of-sentence marker onward; the result is the translated sentence.
                        got = " ".join(
                            idx2en[idx]
                            for idx in pred).split("</S>")[0].strip()
                        # Write the source sentence, the expected translation, and the produced translation to the file
                        fout.write("- source: " + source + "\n")
                        print('\n' + '\n' + '\n' + source + '\n' + '\n' + '\n')
                        fout.write("- expected: " + target + "\n")
                        fout.write("- got: " + got + "\n\n")
                        fout.flush()

                        # bleu score

                        ref = target.split()
                        hypothesis = got.split()

                        if len(ref) > 3 and len(hypothesis) > 3:
                            list_of_refs.append([ref])
                            hypotheses.append(hypothesis)

                ## Calculate bleu score
                # Finally, compute the BLEU score and write it to the file.
                # Only pairs where both reference and hypothesis are longer than 3 tokens were
                # collected above; they are passed to corpus_bleu, and the resulting score is
                # used to evaluate the model.
                str_hyp = ",".join(hypotheses)
                print("len of hypothese is :" + len(hypotheses))
                score = corpus_bleu(list_of_refs, hypotheses)
                fout.write("Bleu Score = " +
                           str(100 * score))  # append the bleu score at the end of the file
Example #19
def eval():
    # Load graph
    g = Graph(is_training=False)
    print("Graph loaded")

    # Load data
    X, Sources, Targets = load_test_data()
    en2idx, idx2en = load_en_vocab()
    de2idx, idx2de = load_de_vocab()

    # Start session
    with g.graph.as_default():
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            # Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print("Restored!")

            # Get model name
            mname = open(hp.logdir + '/checkpoint',
                         'r').read().split('"')[1]  # model name

            # Inference
            if not os.path.exists('results'):
                os.mkdir('results')
            with codecs.open("results/" + mname, "w", "utf-8") as fout:
                list_of_refs, hypotheses = [], []
                for i in range(len(X) // hp.batch_size):

                    # Get mini-batches
                    x = X[i * hp.batch_size:(i + 1) * hp.batch_size]
                    sources = Sources[i * hp.batch_size:(i + 1) *
                                      hp.batch_size]
                    targets = Targets[i * hp.batch_size:(i + 1) *
                                      hp.batch_size]

                    # Autoregressive inference
                    preds = np.zeros((hp.batch_size, hp.maxlen), np.int32)
                    for j in range(hp.maxlen):
                        tensors = [g.preds] + list(
                            g.tensors_of_interest.values())
                        tensors_out = sess.run(tensors, {g.x: x, g.y: preds})
                        _preds = tensors_out[0]
                        preds[:, j] = _preds[:, j]

                        print([idx2de[idx] for idx in preds[0]])

                        # For the first few batches, we save figures giving the attention structure in the encoder.
                        if j == 0 and i < batches_to_visualize:
                            tensor_keys = [None] + list(
                                g.tensors_of_interest.keys()
                            )  # Add a null key at the start so it lines up with the tensors_out list
                            visualizeEncoderAttention(
                                sources=sources,
                                idx2en=idx2en,
                                tensors_of_interest={
                                    key: value
                                    for key, value in zip(
                                        tensor_keys, tensors_out)
                                },
                                batch_index=i)

                    # Write to file
                    for source, target, pred in zip(sources, targets,
                                                    preds):  # sentence-wise
                        got = " ".join(
                            idx2de[idx]
                            for idx in pred).split("</S>")[0].strip()
                        fout.write("- source: " + source + "\n")
                        fout.write("- expected: " + target + "\n")
                        fout.write("- got: " + got + "\n\n")
                        fout.flush()

                        # bleu score
                        ref = target.split()
                        hypothesis = got.split()
                        if len(ref) > 3 and len(hypothesis) > 3:
                            list_of_refs.append([ref])
                            hypotheses.append(hypothesis)

                # Calculate bleu score
                score = corpus_bleu(list_of_refs, hypotheses)
                fout.write("Bleu Score = " + str(100 * score))
Example #20
                self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(self.loss*self.istarget) / (tf.reduce_sum(self.istarget))
               
                # Training Scheme
                self.global_step = tf.Variable(0, name='global_step', trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
                self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)
                   
                # Summary 
                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()

if __name__ == '__main__':                
    # Load vocabulary    
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
    
    # Construct graph
    g = Graph("train"); print("Graph loaded")
    
    # Start session
    sv = tf.train.Supervisor(graph=g.graph, 
                             logdir=hp.logdir,
                             save_model_secs=0)
    with sv.managed_session() as sess:
        for epoch in range(1, hp.num_epochs+1): 
            if sv.should_stop(): break
            for step in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'):
                sess.run(g.train_op)
                
Example #21
def eval():
    # Load graph
    g = Graph(is_training=False)
    print("Graph loaded")

    # Load data
    X, Sources, Targets = load_test_data()
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()

    #     X, Sources, Targets = X[:33], Sources[:33], Targets[:33]

    # Start session
    with g.graph.as_default():
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            ## Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print("Restored!")

            ## Get model name
            mname = open(hp.logdir + '/checkpoint',
                         'r').read().split('"')[1]  # model name

            ## Inference
            if not os.path.exists('results'): os.mkdir('results')
            with codecs.open("results/" + mname, "w", "utf-8") as fout:
                list_of_refs, hypotheses = [], []
                for i in range(len(X) // hp.batch_size):
                    ### Get mini-batches
                    x = X[i * hp.batch_size:(i + 1) * hp.batch_size]
                    sources = Sources[i * hp.batch_size:(i + 1) *
                                      hp.batch_size]
                    targets = Targets[i * hp.batch_size:(i + 1) *
                                      hp.batch_size]
                    predx = np.zeros((hp.batch_size, hp.maxlen, hp.beam_width),
                                     np.int32)
                    predx_prob = np.zeros_like(predx, np.float64)

                    logits = np.zeros((hp.batch_size, hp.maxlen, len(en2idx)),
                                      np.float64)
                    print(x[1:2, :])

                    for j in range(hp.batch_size):
                        # For testing, this range can be shrunk to speed things up;
                        # each j beam-decodes one sentence of the batch.
                        print(j)
                        preds_sent = np.zeros((1, hp.maxlen, hp.beam_width))
                        probs_sent = np.zeros_like(preds_sent, np.float64)
                        #probs_ref = np.zeros_like(preds_sent, np.float64)
                        x_a = x[j:j + 1, :]  #input one sentence each time
                        sent_len = x_a[0, :].tolist().index(0)
                        #print(x_a)
                        preds = np.zeros((1, hp.maxlen), np.int32)
                        preds_prob = np.zeros_like(preds, np.float64)
                        _logits = np.array(
                            sess.run(g.logits, {
                                g.x: x_a,
                                g.y: preds
                            }))
                        sent_j = _logits[0, 0]
                        #print(sent_j)
                        sos = sent_j.argsort(
                        )[-1:]  #retrieve the token of first character (Start of sentence)
                        preds[
                            0,
                            0] = sos  #settle the sos token at the beginning of preds
                        sos_prob = sent_j[sos]
                        preds_prob[0, 0] = sos_prob
                        #print(preds[0,0])
                        for bw in range(hp.beam_width):
                            preds_sent[0, 0, bw] = preds[0, 0]
                            probs_sent[0, 0, bw] = preds_prob[0, 0]
                        #print(probs_sent)
                        _logits = np.array(
                            sess.run(g.logits, {
                                g.x: x_a,
                                g.y: preds
                            }))
                        sent_j = _logits[0]
                        word_1 = sent_j[1]
                        word_1 = word_1 + preds_prob[0, 0]
                        top_bw_idx = word_1.argsort()[-hp.beam_width:]
                        #print(top_bw_idx)
                        top_bw_probs = word_1[top_bw_idx]
                        #print(top_bw_probs)
                        for bw in range(hp.beam_width):
                            preds_sent[0, 1, bw] = np.copy(top_bw_idx[bw])
                            #print(top_bw_probs[bw])
                            probs_sent[0, 1, bw] = top_bw_probs[bw]
                        #print(probs_sent)
                        #settle top_bw tokens for the second character (first word)

                        #print(probs_sent)
                        for k in range(2, hp.maxlen):  # this part needs special handling
                            added_probs = []
                            paths_candidate = []
                            preds_prob_list = []

                            for bw in range(hp.beam_width):
                                preds[0, :] = preds_sent[0, :, bw].copy()

                                preds_prob[0, :] = probs_sent[0, :, bw].copy()
                                #print(preds_prob)

                                if (preds_sent[0, k - 1, bw] == 3):
                                    preds_sent[0, k, bw] = 3
                                    current_path = preds_sent[0, :, bw]
                                    new_path = np.copy(current_path)
                                    new_path[k] = 3

                                    paths_candidate.append(new_path)

                                    preds_prob[0, k] = 0
                                    current_preds_prob = np.copy(preds_prob)
                                    print(current_preds_prob)
                                    added_probs = np.concatenate(
                                        (added_probs,
                                         [np.sum(current_preds_prob[0])]), 0)

                                    preds_prob_list.append(current_preds_prob)

                                if (preds_sent[0, k - 1, bw] != 3):

                                    current_path = preds_sent[0, :, bw]
                                    _logits = np.array(
                                        sess.run(g.logits, {
                                            g.x: x_a,
                                            g.y: preds
                                        }))
                                    sent_j = _logits[0]
                                    word_k = sent_j[
                                        k]  #+np.sum(preds_prob[0]) #log(a*b) = log a + log b
                                    top_bw_idx = word_k.argsort(
                                    )[-hp.beam_width:]

                                    top_bw_probs = sent_j[k][top_bw_idx]

                                    for bmw in range(hp.beam_width):
                                        new_path = np.copy(current_path)
                                        new_path[k] = top_bw_idx[bmw]
                                        current_step_probs = top_bw_probs[bmw]
                                        current_path_probs = np.copy(
                                            preds_prob[0])
                                        current_path_probs[
                                            k] = current_step_probs
                                        added_probs = np.concatenate(
                                            (added_probs,
                                             [np.sum(current_path_probs)]), 0)
                                        #print(new_path)
                                        paths_candidate.append(new_path)
                                        preds_prob_list.append(
                                            current_path_probs)

                                #print("what hell is going on")
                                #print(sub_candidates)
                                #print("this is a =========")

                            a_idx = np.array(
                                added_probs).argsort()[-hp.beam_width:]
                            a_prob = added_probs[a_idx]
                            #print(a_prob)

                            print(preds_prob_list)
                            for bw in range(hp.beam_width):

                                preds_sent[0, :, bw] = np.copy(
                                    paths_candidate[a_idx[bw]])
                                #print(paths_candidate[a_idx[bw]])
                                #print(preds_sent[0, :, bw])

                                # Keep the probabilities of the selected path,
                                # indexed by a_idx to match paths_candidate.
                                probs_sent[0, :, bw] = np.copy(
                                    preds_prob_list[a_idx[bw]])
                                print(probs_sent)

                            #print("probs_sent:")
                            #print(probs_sent)

                        predx[j, :, :] = preds_sent
                        predx_prob[j, :, :] = probs_sent
                        #print("checkpoint")
                        #sys.exit()
                    print("done")
                    ### Write to file
                    for source, target, pred, prob in zip(
                            sources, targets, predx,
                            predx_prob):  # sentence-wise
                        candits = []
                        candits_probs = []
                        for i in range(hp.beam_width):
                            pres = pred[:, i]
                            pros = prob[:, i]
                            got = "".join(
                                idx2en[idx]
                                for idx in pres).split("</S>")[0].strip()
                            candits.append(got)
                            candits_probs.append(pros)

                        fout.write("- source:   " + source + "\n")
                        fout.write("- expected: " + target + "\n")
                        print(candits)

                        for i in range(len(candits)):
                            fout.write("- got:      " + candits[i] + "\n")
                            m = len(candits[i])
                            fout.write(' '.join(
                                str(each)
                                for each in candits_probs[i].tolist()
                                [:m - 2]))  #each for each in
                            fout.write("\n")
                        fout.write("\n")

                        fout.flush()

                        # bleu score
                        ref = target.split()
                        hypothesis = got.split()
                        if len(ref) > 3 and len(hypothesis) > 3:
                            list_of_refs.append([ref])
                            hypotheses.append(hypothesis)
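
The excerpt above grows each beam with its top hp.beam_width continuations at step k and then keeps the best hp.beam_width summed path scores. Below is a minimal NumPy sketch of that selection step, using log-probabilities rather than the raw logit values the excerpt sums; all names and values are illustrative, not taken from the example.

import numpy as np

beam_width, vocab = 3, 6
path_scores = np.array([-1.2, -1.5, -2.0])   # summed log-probs of the current beams
step_log_probs = np.log(np.random.dirichlet(np.ones(vocab), size=beam_width))  # (beam_width, vocab)

# Every (beam, word) continuation scores path score + step log-prob (log(a*b) = log a + log b).
candidate_scores = path_scores[:, None] + step_log_probs        # (beam_width, vocab)
flat = candidate_scores.ravel()
top = flat.argsort()[-beam_width:]                              # best beam_width continuations
beam_idx, word_idx = np.unravel_index(top, candidate_scores.shape)
print(list(zip(beam_idx, word_idx, flat[top])))
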
Beispiel #22
0
    def __init__(self, is_training=True):
        self.graph = tf.Graph()

        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()
            else:
                # x: (32, 10), y: (32, 10): a batch of 32 sentences, each of length 10
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            """
            定义decoder部分的input
            
             假设真实翻译后的输出为 i am a student </S>
             
             decoder部分的input应为: <S> i am a student
            """
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]),
                -1)  # 2 stands for <S>, the initial decoder input

            # Vocabularies
            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()

            with tf.variable_scope("encoder"):
                # Embedding
                self.enc = embedding(
                    self.x,
                    vocab_size=len(de2idx),
                    num_units=hp.hidden_units,
                    zero_pad=True,  # keep the <PAD> embedding fixed at zero
                    scale=True,
                    scope="enc_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope='enc_pe')

                else:
                    self.enc += embedding(tf.tile(
                        tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                        [tf.shape(self.x)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="enc_pe")

                ##Drop out
                self.enc = tf.layers.dropout(
                    self.enc,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### MultiHead Attention
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False)
                        self.enc = feedforward(
                            self.enc,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            with tf.variable_scope("decoder"):
                # Embedding
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(en2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="dec_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                                    vocab_size=hp.maxlen,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(
                        tf.expand_dims(
                            tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                        [tf.shape(self.decoder_inputs)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="dec_pe")

                # Dropout
                self.dec = tf.layers.dropout(
                    self.dec,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.dec,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=True,
                            scope="self_attention")

                        ## Multihead Attention ( vanilla attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False,
                            scope="vanilla_attention")

                        ## Feed Forward
                        self.dec = feedforward(
                            self.dec,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            # Final linear projection
            self.logits = tf.layers.dense(self.dec, len(en2idx))
            self.preds = tf.to_int32(tf.argmax(self.logits, dimension=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) * self.istarget /
                (tf.reduce_sum(self.istarget)))

            if is_training:
                # Loss
                # Label smoothing: the 0s in the one-hot targets become a small value, the 1s a value slightly below 1.
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
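
A small NumPy illustration of the decoder-input shift described in the docstring above; the token ids (2 = <S>, 3 = </S>) follow the comments in these examples, and the word ids are made up.

import numpy as np

y = np.array([[5, 6, 7, 8, 3]])                     # "i am a student </S>" as ids
start = np.full((y.shape[0], 1), 2, dtype=y.dtype)  # a column of <S> tokens (id 2)
decoder_inputs = np.concatenate([start, y[:, :-1]], axis=-1)
print(decoder_inputs)                               # [[2 5 6 7 8]] -> "<S> i am a student"
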
Beispiel #23
0
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()  # (N, T)
            else:  # inference
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            # define decoder inputs
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2:<S>

            # Load vocabulary
            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()

            # Encoder
            with tf.variable_scope("encoder"):
                ## Embedding
                self.enc = embedding(self.x,
                                     vocab_size=len(de2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="enc_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="enc_pe")
                else:
                    self.enc += embedding(tf.tile(
                        tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                        [tf.shape(self.x)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="enc_pe")

                ## Dropout
                self.enc = tf.layers.dropout(
                    self.enc,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### Multihead Attention
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False)

                        ### Feed Forward
                        self.enc = feedforward(
                            self.enc,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            # Decoder
            with tf.variable_scope("decoder"):
                ## Embedding
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(en2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="dec_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(
                        tf.expand_dims(
                            tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                        [tf.shape(self.decoder_inputs)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="dec_pe")

                ## Dropout
                self.dec = tf.layers.dropout(
                    self.dec,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.dec,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=True,
                            scope="self_attention")

                        ## Multihead Attention ( vanilla attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False,
                            scope="vanilla_attention")

                        ## Feed Forward
                        self.dec = feedforward(
                            self.dec,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            # Final linear projection
            # Project the last dimension to vocabulary size to get per-word logits; the argmax index is the prediction.
            self.logits = tf.layers.dense(self.dec,
                                          len(en2idx))  #(N, T, vocab_len)
            self.preds = tf.to_int32(tf.arg_max(self.logits,
                                                dimension=-1))  # (N, T)
            # Mark every non-<PAD> position of y with 1.0.
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            # acc = (correctly predicted non-<PAD> words in the batch) / (non-<PAD> words in the batch)
            # tip: tf.reduce_sum() without an axis sums over all dimensions
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) *
                self.istarget) / (tf.reduce_sum(self.istarget))
            # Log acc to the summaries to monitor training.
            tf.summary.scalar('acc', self.acc)

            if is_training:
                # Loss
                # tf.one_hot(tensor, depth) builds a tensor of shape tensor.shape + [depth], with 1.0 at each index and 0.0 elsewhere.
                # If an index is >= depth, the whole row is 0.0.
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(
                        en2idx)))  # y_smoothed becomes (N, T, vocab_len) after one_hot
                # tf.nn.softmax_cross_entropy_with_logits: 1) applies softmax to the logits,
                # 2) takes the cross-entropy between that vocab_len distribution and the labels,
                # producing an (N, T) tensor, i.e. one loss value per word.
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)  # (N, T)
                # Drop the loss at <PAD> positions, then average to get mean_loss (see the masked-loss sketch after this example).
                self.mean_loss = tf.reduce_sum(self.loss * self.istarget) / (
                    tf.reduce_sum(self.istarget))  # scalar

                # Training Scheme
                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                # Summary
                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
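
A NumPy sketch of the smoothed targets and the <PAD>-masked mean loss computed above. The label_smoothing rule shown is the common epsilon-uniform variant and is only assumed to match the helper used in these examples.

import numpy as np

def label_smoothing(one_hot, epsilon=0.1):
    # Assumed rule: keep (1 - epsilon) on the true class, spread epsilon uniformly.
    K = one_hot.shape[-1]
    return (1.0 - epsilon) * one_hot + epsilon / K

y = np.array([[4, 6, 0]])                 # last position is <PAD> (id 0)
vocab_len = 8
y_smoothed = label_smoothing(np.eye(vocab_len)[y])   # (N, T, vocab_len)

loss = np.array([[0.7, 1.2, 5.0]])        # pretend per-token cross-entropy, (N, T)
istarget = (y != 0).astype(np.float32)    # 1.0 at real tokens, 0.0 at <PAD>
mean_loss = (loss * istarget).sum() / istarget.sum()
print(y_smoothed[0, 0])                   # 0.0125 everywhere except 0.9125 at index 4
print(mean_loss)                          # (0.7 + 1.2) / 2 = 0.95 -- the <PAD> loss is ignored
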
Beispiel #24
0
def eval(hp):
    # Load graph
    g = Graph(hp=hp, is_training=False)
    print("Graph loaded")

    # Load data

    X, X_image, X_length, Y, Sources, Targets, X_turn_number, SRC_emotion, TGT_emotion, Speakers, A = load_test_data(
        hp)
    #print(X)
    de2idx, idx2de = load_de_vocab(hp)
    en2idx, idx2en = load_en_vocab(hp)

    # Start session
    with g.graph.as_default():
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            ## Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print("Restored!")

            ## Get model name
            mname = open(hp.logdir + '/checkpoint',
                         'r').read().split('"')[1]  # model name
            #fftmp=open("tmp.txt","w")
            ## Inference
            if not os.path.exists('results'): os.mkdir('results')
            with codecs.open("results/" + mname, "w", "utf-8") as fout:
                list_of_refs, hypotheses, test_loss = [], [], []
                for i in range(len(X) // hp.batch_size):

                    ### Get mini-batches
                    x = X[i * hp.batch_size:(i + 1) * hp.batch_size]
                    x_length = X_length[i * hp.batch_size:(i + 1) *
                                        hp.batch_size]
                    y = Y[i * hp.batch_size:(i + 1) * hp.batch_size]
                    x_emotion = SRC_emotion[i * hp.batch_size:(i + 1) *
                                            hp.batch_size]
                    speaker = Speakers[i * hp.batch_size:(i + 1) *
                                       hp.batch_size]
                    x_image = X_image[i * hp.batch_size:(i + 1) *
                                      hp.batch_size]
                    a = A[i * hp.batch_size:(i + 1) * hp.batch_size]
                    x_turn_number = X_turn_number[i * hp.batch_size:(i + 1) *
                                                  hp.batch_size]
                    sources = Sources[i * hp.batch_size:(i + 1) *
                                      hp.batch_size]
                    targets = Targets[i * hp.batch_size:(i + 1) *
                                      hp.batch_size]
                    eval_batch = sess.run(
                        g.mean_loss, {
                            g.x: x,
                            g.x_image: x_image,
                            g.x_length: x_length,
                            g.y: y,
                            g.x_emotion: x_emotion,
                            g.speaker: speaker,
                            g.A: a,
                            g.x_turn_number: x_turn_number
                        })
                    test_loss.append(eval_batch)

                    ### Autoregressive inference
                    preds = np.zeros((hp.batch_size, hp.maxlen), np.int32)
                    for j in range(hp.maxlen):
                        _preds = sess.run(g.preds, {
                            g.x: x,
                            g.x_length: x_length,
                            g.y: preds
                        })
                        preds[:, j] = _preds[:, j]

                    ### Write to file
                    for source, target, pred in zip(sources, targets,
                                                    preds):  # sentence-wise
                        got = " ".join(
                            idx2en[idx]
                            for idx in pred).split("</S>")[0].strip()
                        fout.write("- source: " + source + "\n")
                        fout.write("- expected: " + target + "\n")
                        fout.write("- got: " + got + "\n\n")
                        fout.flush()

                        # bleu score
                        #ref = target.split()
                        ref = target.split(u"</d>")[1].split()
                        hypothesis = got.split()
                        if len(ref) > 3 and len(hypothesis) > 3:
                            list_of_refs.append([ref])
                            hypotheses.append(hypothesis)

                ## Calculate bleu score
                score = corpus_bleu(list_of_refs, hypotheses)
                fout.write("Test Bleu Score = " + str(100 * score))
                print("Test Bleu Score = " + str(100 * score))
                print("eval PPL = %.5lf" %
                      (round(math.exp(np.mean(test_loss)), 4)))
                print("eval loss = %.5lf" % (np.mean(test_loss)))
                # Distinct-1, Distinct-2
                candidates = []
                for line in hypotheses:
                    candidates.extend(line)
                distinct_1, distinct_2 = cal_Distinct(candidates)
                print('Distinct-1: ' + str(round(distinct_1, 4)) +
                      ', Distinct-2: ' + str(round(distinct_2, 4)))
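
The eval() above reports perplexity as exp of the mean test loss and diversity via cal_Distinct. Below is a small sketch of the usual Distinct-n definition, assuming cal_Distinct follows the unique-n-grams over total-n-grams ratio; the helper name distinct_n and the sample values are illustrative.

import math
from collections import Counter

def distinct_n(tokens, n):
    ngrams = [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    return len(Counter(ngrams)) / len(ngrams) if ngrams else 0.0

candidates = "the cat sat on the mat the cat".split()
print(distinct_n(candidates, 1), distinct_n(candidates, 2))   # 0.625 0.857...

test_loss = [3.1, 2.9, 3.0]
print(math.exp(sum(test_loss) / len(test_loss)))              # eval PPL = exp(mean loss)
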
Beispiel #25
0
                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                # Summary
                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()


if __name__ == '__main__':
    # Load vocabulary
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()

    # Construct graph
    g = Graph("train")
    print("Graph loaded")

    # Start session
    sv = tf.train.Supervisor(graph=g.graph,
                             logdir=hp.logdir,
                             save_model_secs=0)
    with sv.managed_session() as sess:
        for epoch in range(1, hp.num_epochs + 1):
            if sv.should_stop(): break
            for step in tqdm(range(g.num_batch),
                             total=g.num_batch,
Beispiel #26
0
def eval(): 
    # Load graph
    g = Graph(is_training=False)
    print("Graph loaded")
    
    # Load data
    X, Sources, Targets = load_test_data()
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
#     X, Sources, Targets = X[:33], Sources[:33], Targets[:33]
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
    # Start session         
    with g.graph.as_default():    
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)) as sess:
            ## Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print("Restored!")
              
            ## Get model name
            mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name
            ## Inference
            totalTransNum = 0
            if not os.path.exists('results'): os.mkdir('results')
            with codecs.open('results/'+mname+'.trans', 'w', 'utf8') as tfout:
                with codecs.open("results/" + mname, "w", "utf-8") as fout:
                    list_of_refs, hypotheses = [], []
                    for i in range((len(X) // hp.batch_size) + 1):
                        ### Get mini-batches
                        batchEnd = (i+1)*hp.batch_size
                        realBatchSize = hp.batch_size
                        if batchEnd > len(X):
                            realBatchSize = hp.batch_size - (batchEnd - len(X))
                            batchEnd = len(X)

                        x = X[i*hp.batch_size: batchEnd]
                        sources = Sources[i*hp.batch_size: batchEnd]
                        targets = Targets[i*hp.batch_size: batchEnd]
                        totalTransNum += len(sources)
                        ### Autoregressive inference
                        preds = np.zeros((realBatchSize, hp.maxlen), np.int32)
                        for j in range(hp.maxlen):
                            _preds = sess.run(g.preds, {g.x: x, g.y: preds})
                            preds[:, j] = _preds[:, j]

                        ### Write to file
                        for source, target, pred in zip(sources, targets, preds): # sentence-wise
                            got = " ".join(idx2en[idx] for idx in pred).split("</S>")[0].strip()
                            fout.write("- source: " + source +"\n")
                            fout.write("- expected: " + target + "\n")
                            fout.write("- got: " + got + "\n\n")
                            tfout.write(got)
                            tfout.write('\n')

                            # bleu score
                            ref = target.split()
                            hypothesis = got.split()
                            if len(ref) > 3 and len(hypothesis) > 3:
                                list_of_refs.append([ref])
                                hypotheses.append(hypothesis)

                    ## Calculate bleu score
                    score = corpus_bleu(list_of_refs, hypotheses)
                    fout.write("Bleu Score = " + str(100*score))
                    fout.write('\n')

                    print('totalTransNum', totalTransNum, 'Bleu', str(100*score))
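
Unlike the earlier eval() examples, this variant also translates the final partial batch. A quick check of equivalent batch arithmetic, with made-up totals:

batch_size, total = 32, 103
num_batches = (total // batch_size) + 1               # 4, as in the loop above
for i in range(num_batches):
    batch_end = min((i + 1) * batch_size, total)
    real_batch_size = batch_end - i * batch_size      # last batch: 96..103, size 7
    print(i, i * batch_size, batch_end, real_batch_size)
# Note: when total is an exact multiple of batch_size, the extra final batch is empty.
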
Beispiel #27
0
    def __init__(self, is_training):
        self.is_training = is_training
        self.graph = tf.Graph()
        with self.graph.as_default():
            self._selector = True
            self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            self.x_target = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            self.image = tf.placeholder(tf.float32, shape=[None, 196, 1024])
            self.dropout_rate = tf.placeholder(tf.float32)
            self.lstm_drop_rate = tf.placeholder(tf.float32)
            self.lr = tf.placeholder(tf.float32, shape=[])
            batch_size = tf.shape(self.image)[0]
            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()
            self.batch_size = batch_size
            self.en2idx = en2idx
            self.de2idx = de2idx
            self.weight_initializer = tf.contrib.layers.xavier_initializer()
            self.istarget = tf.to_float(tf.not_equal(self.x_target, 0))
            with tf.variable_scope("en_caption"):

                with tf.variable_scope("embedding"):
                    self.lookup_table = tf.get_variable(
                        'lookup_table',
                        dtype=tf.float32,
                        shape=[len(self.en2idx), hp.hidden_units_cap],
                        initializer=tf.random_uniform_initializer(minval=-1.0,
                                                                  maxval=1.0))
                with tf.variable_scope("lstm"):
                    lstm_cell = tf.nn.rnn_cell.LSTMCell(hp.lstm_units)

                    lstm = tf.nn.rnn_cell.DropoutWrapper(
                        lstm_cell,
                        input_keep_prob=1.0 - self.lstm_drop_rate,
                        output_keep_prob=1.0 - self.lstm_drop_rate)
                    self.lstm = lstm
                self.feature = tf.contrib.layers.batch_norm(
                    inputs=self.image,
                    decay=0.95,
                    center=True,
                    scale=True,
                    updates_collections=None,
                    is_training=False)  # batch norm kept in inference mode (note: could be tied to self.is_training)
                with tf.variable_scope("initialize"):
                    context_mean = tf.reduce_mean(self.feature, axis=1)
                    initial_memory, initial_output = self.initial(context_mean)
                    initial_state = initial_memory, initial_output
                last_state = initial_state
                last_output = initial_output
                self.last_state, self.last_output = initial_state, initial_output
                logit_list, self.preds_list, alpha_list = [], [], []
                sentence = tf.nn.embedding_lookup(
                    self.lookup_table,
                    tf.ones(batch_size, dtype=tf.int32) * 2)
                if not is_training:
                    beam_width = 5
                    self.feature = tf.tile(
                        tf.expand_dims(self.feature, axis=1),
                        [1, beam_width, 1, 1])
                    self.preds = self.beam_search(sentence,
                                                  beam_width=beam_width,
                                                  num_classes=len(en2idx))
                else:
                    for i in range(hp.maxlen):
                        #batch_size x embed_dim
                        alpha = self.attention(last_output)  #batch_size x 196
                        mask_alpha = tf.tile(
                            tf.expand_dims(self.istarget[:, i], 1), [1, 196])
                        alpha_list.append(alpha * mask_alpha)

                        image_attention = tf.reduce_sum(
                            self.feature * tf.expand_dims(alpha, 2),
                            axis=1)  #batch_size x 1024
                        if self._selector:
                            image_attention = self.selector(
                                image_attention, last_output)
                        inputs = tf.concat((image_attention, sentence), axis=1)
                        output, state = lstm(inputs, last_state)
                        # dropout on the LSTM output before decoding
                        temp = tf.layers.dropout(output,
                                                 rate=self.dropout_rate)
                        expanded_output = tf.concat(
                            [temp, sentence, image_attention], axis=1)
                        logits = self.decode(expanded_output)
                        prediction = tf.argmax(logits, 1)
                        self.preds_list.append(prediction)
                        logit_list.append(logits)
                        sentence = tf.nn.embedding_lookup(
                            self.lookup_table, self.x[:, i])
                        last_state = state
                        last_output = output
            if is_training:
                self.preds_list = tf.stack(self.preds_list, axis=1)
                logits = tf.stack(logit_list, axis=1)
                alpha_list = tf.stack(alpha_list, axis=1)
                attentions = tf.reduce_sum(alpha_list, axis=1)
                diffs = tf.ones_like(attentions) - attentions
                attention_loss = hp.attention_loss_factor \
                                 * tf.nn.l2_loss(diffs) \
                                 / tf.cast((batch_size * 196),dtype=tf.float32)
                self.loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                    labels=tf.one_hot(self.x_target, len(en2idx)),
                    logits=logits)
                self.loss = tf.reduce_sum(
                    self.loss * self.istarget) / tf.reduce_sum(
                        self.istarget) + attention_loss
                self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-9)
                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.train_op = self.optimizer.minimize(
                    self.loss, global_step=self.global_step)
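                # Note: the optimize_loss call below rebuilds train_op with gradient
                # clipping (clip_gradients), superseding the minimize op created above.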
                self.train_op = tf.contrib.layers.optimize_loss(
                    loss=self.loss,
                    global_step=self.global_step,
                    learning_rate=self.lr,
                    optimizer=self.optimizer,
                    clip_gradients=hp.clip_gradients)
            self.value_list = slim.get_variables_to_restore()
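
In the captioning graph above, the attention weights alpha pool the 196 image-region features into a single context vector per example. A NumPy sketch of that pooling step with random data; shapes follow the placeholders above.

import numpy as np

batch, regions, dim = 2, 196, 1024
feature = np.random.rand(batch, regions, dim)        # like self.feature
alpha = np.random.rand(batch, regions)
alpha /= alpha.sum(axis=1, keepdims=True)            # attention weights sum to 1 per example

image_attention = (feature * alpha[:, :, None]).sum(axis=1)   # (batch, dim) attended context
print(image_attention.shape)                                  # (2, 1024)
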
Beispiel #28
0
    def build_network(self):
        #import ipdb; ipdb.set_trace()
        config = self.config
        de2idx, idx2de = load_de_vocab()
        en2idx, idx2en = load_en_vocab()

        # Encoder
        with tf.variable_scope("encoder"):
            ## Embedding
            self.enc = embedding(self.x,
                                 len(de2idx),
                                 num_units=config.hidden_dim,
                                 scale=True,
                                 scope='enc_embed')

            ## plus position embedding
            self.enc += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), \
                                            [tf.shape(self.x)[0], 1]),
                                config.maxlen,
                                config.hidden_dim,
                                zero_pad=False,
                                scale=False,
                                scope="enc_pe")

            self.enc = dropout(self.enc,
                               config.keep_rate,
                               is_train=self.is_train)

            self.enc_ = self.enc
            for block_idx in range(config.num_enc_block_1):
                scope = "encoder_block_{}".format(block_idx)
                enc_out = conv2d(self.enc,
                                 kernel_shape=(config.enc_kernel_width, 1),
                                 scope=scope)
                enc_out = batch_norm(enc_out,
                                     is_training=self.is_train,
                                     scope="lm" + scope)
                self.enc = enc_out

        # Decoder
        with tf.variable_scope("decoder"):
            ## Embedding
            self.dec = embedding(self.decode_input,
                                 len(en2idx),
                                 config.hidden_dim,
                                 scale=True,
                                 scope='dec_embed')
            ## plus position embedding
            self.dec += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.decode_input)[1]), 0), \
                                            [tf.shape(self.decode_input)[0], 1]),
                                config.maxlen,
                                config.hidden_dim,
                                zero_pad=False,
                                scale=False,
                                scope='dec_pe')

            self.dec_ = self.dec
            for block_idx in range(config.num_dec_block_1):
                scope = "decoder_block_conv_{}".format(block_idx)
                attention_scope = "decoder_block_att_{}".format(block_idx)
                dec_out = conv2d(self.dec,
                                 kernel_shape=(config.dec_kernel_width, 1),
                                 causal=True,
                                 scope=scope)
                dec_out = attention_pool(self.enc_,
                                         self.dec,
                                         enc_out,
                                         dec_out,
                                         scope=attention_scope)
                dec_out = dec_out + self.dec
                dec_out = batch_norm(dec_out,
                                     is_training=self.is_train,
                                     scope="lm" + scope)
                self.dec = dec_out

        with tf.variable_scope('encoder'):
            for block_idx in range(config.num_enc_block_2):
                scope = "encoder_block_{}".format(config.num_enc_block_1 +
                                                  block_idx)
                enc_out = conv2d(self.enc,
                                 kernel_shape=(config.enc_kernel_width, 1),
                                 num_outputs=config.hidden_dim_2,
                                 scope=scope)
                enc_out = batch_norm(enc_out,
                                     is_training=self.is_train,
                                     scope="lm" + scope)
                self.enc = enc_out

        with tf.variable_scope('decoder'):
            for block_idx in range(config.num_dec_block_2):
                scope = "decoder_block_conv_{}".format(config.num_dec_block_1 +
                                                       block_idx)
                attention_scope = "decoder_block_att_{}".format(
                    config.num_dec_block_1 + block_idx)
                dec_out = conv2d(self.dec,
                                 kernel_shape=(config.dec_kernel_width, 1),
                                 num_outputs=config.hidden_dim_2,
                                 causal=True,
                                 scope=scope)
                dec_out = attention_pool(self.enc_,
                                         self.dec,
                                         enc_out,
                                         dec_out,
                                         scope=attention_scope)
                dec_out = dec_out + self.dec
                dec_out = batch_norm(dec_out,
                                     is_training=self.is_train,
                                     scope="lm" + scope)
                self.dec = dec_out

        with tf.variable_scope("softmax_layer"):
            w = tf.get_variable('w', [config.hidden_dim, len(en2idx)])
            b = tf.get_variable('b', [len(en2idx)])
            w = tf.tile(tf.expand_dims(w, 0), [config.batch_size, 1, 1])
            self.logits = tf.matmul(dec_out, w) + b
            self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) *
                self.istarget) / tf.reduce_sum(self.istarget)
            tf.summary.scalar('acc', self.acc)

            if self.is_train:
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_mean(self.loss)
                tf.summary.scalar('mean_loss', self.mean_loss)

        self.tensors = {
            'source_sentence': self.enc_,
            'target_sentence': self.dec_,
            'enc_out': enc_out,
            'dec_out': dec_out,
            'predictions': self.preds,
            'logits': self.logits
        }
        if self.is_train:
            self.tensors['loss'] = self.loss

        for key, value in self.tensors.items():
            tf.summary.histogram(key, value)
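
The softmax_layer above tiles the projection matrix per batch element so a batched matmul can produce the logits; this is equivalent to applying one dense layer at every time step. A NumPy check with made-up sizes:

import numpy as np

batch, T, hidden, vocab = 2, 5, 8, 11
dec_out = np.random.rand(batch, T, hidden)
w = np.random.rand(hidden, vocab)
b = np.random.rand(vocab)

w_tiled = np.tile(w[None, :, :], (batch, 1, 1))      # (batch, hidden, vocab), as in the tf.tile above
logits = dec_out @ w_tiled + b                       # (batch, T, vocab)
assert np.allclose(logits, np.einsum('bth,hv->btv', dec_out, w) + b)
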
Beispiel #29
0
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data() # (N, T)
            else: # inference
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            # define decoder inputs
            self.decoder_inputs = tf.concat((tf.ones_like(self.y[:, :1])*2, self.y[:, :-1]), -1) # 2:<S>

            # Load vocabulary    
            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()
            
            # Encoder
            with tf.variable_scope("encoder"):
                ## Embedding
                self.enc = embedding(self.x, 
                                      vocab_size=len(de2idx), 
                                      num_units=hp.hidden_units, 
                                      scale=True,
                                      scope="enc_embed")
                
                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                      num_units=hp.hidden_units, 
                                      zero_pad=False, 
                                      scale=False,
                                      scope="enc_pe")
                else:
                    self.enc += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]),
                                      vocab_size=hp.maxlen, 
                                      num_units=hp.hidden_units, 
                                      zero_pad=False, 
                                      scale=False,
                                      scope="enc_pe")
                    
                 
                ## Dropout
                self.enc = tf.layers.dropout(self.enc, 
                                            rate=hp.dropout_rate, 
                                            training=tf.convert_to_tensor(is_training))
                
                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### Multihead Attention
                        self.enc = multihead_attention(queries=self.enc, 
                                                        keys=self.enc, 
                                                        num_units=hp.hidden_units, 
                                                        num_heads=hp.num_heads, 
                                                        dropout_rate=hp.dropout_rate,
                                                        is_training=is_training,
                                                        causality=False)
                        
                        ### Feed Forward
                        self.enc = feedforward(self.enc, num_units=[4*hp.hidden_units, hp.hidden_units])
            
            # Decoder
            with tf.variable_scope("decoder"):
                ## Embedding
                self.dec = embedding(self.decoder_inputs, 
                                      vocab_size=len(en2idx), 
                                      num_units=hp.hidden_units,
                                      scale=True, 
                                      scope="dec_embed")
                
                ## Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                      vocab_size=hp.maxlen, 
                                      num_units=hp.hidden_units, 
                                      zero_pad=False, 
                                      scale=False,
                                      scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0), [tf.shape(self.decoder_inputs)[0], 1]),
                                      vocab_size=hp.maxlen, 
                                      num_units=hp.hidden_units, 
                                      zero_pad=False, 
                                      scale=False,
                                      scope="dec_pe")
                
                ## Dropout
                self.dec = tf.layers.dropout(self.dec, 
                                            rate=hp.dropout_rate, 
                                            training=tf.convert_to_tensor(is_training))
                
                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec = multihead_attention(queries=self.dec, 
                                                        keys=self.dec, 
                                                        num_units=hp.hidden_units, 
                                                        num_heads=hp.num_heads, 
                                                        dropout_rate=hp.dropout_rate,
                                                        is_training=is_training,
                                                        causality=True, 
                                                        scope="self_attention")
                        
                        ## Multihead Attention ( vanilla attention)
                        self.dec = multihead_attention(queries=self.dec, 
                                                        keys=self.enc, 
                                                        num_units=hp.hidden_units, 
                                                        num_heads=hp.num_heads,
                                                        dropout_rate=hp.dropout_rate,
                                                        is_training=is_training, 
                                                        causality=False,
                                                        scope="vanilla_attention")
                        
                        ## Feed Forward
                        self.dec = feedforward(self.dec, num_units=[4*hp.hidden_units, hp.hidden_units])
                
            # Final linear projection
            self.logits = tf.layers.dense(self.dec, len(en2idx))
            self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y))*self.istarget)/ (tf.reduce_sum(self.istarget))
            tf.summary.scalar('acc', self.acc)
                
            if is_training:  
                # Loss
                self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(self.loss*self.istarget) / (tf.reduce_sum(self.istarget))
               
                # Training Scheme
                self.global_step = tf.Variable(0, name='global_step', trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
                self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)
                   
                # Summary 
                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
Beispiel #30
0
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()  # (N, T)
            else:  # inference
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            # define decoder inputs
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2:<S>

            # Load vocabulary
            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()

            # Encoder
            with tf.variable_scope("encoder"):
                ## Embedding
                self.enc = embedding(self.x,
                                     vocab_size=len(de2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="enc_embed")

                key_masks = tf.expand_dims(
                    tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1)), -1)

                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="enc_pe")
                else:
                    self.enc += embedding(tf.tile(
                        tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                        [tf.shape(self.x)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="enc_pe")

                self.enc *= key_masks

                ## Dropout
                self.enc = tf.layers.dropout(
                    self.enc,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### Multihead Attention
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False)

                        ### Feed Forward
                        self.enc = feedforward(
                            self.enc,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            # Decoder
            with tf.variable_scope("decoder"):
                ## Embedding
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(en2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="dec_embed")

                key_masks = tf.expand_dims(
                    tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)

                ## Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                                    vocab_size=hp.maxlen,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(
                        tf.expand_dims(
                            tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                        [tf.shape(self.decoder_inputs)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="dec_pe")
                self.dec *= key_masks

                ## Dropout
                self.dec = tf.layers.dropout(
                    self.dec,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.dec,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=True,
                            scope="self_attention")

                        ## Multihead Attention ( vanilla attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False,
                            scope="vanilla_attention")

                        ## Feed Forward
                        self.dec = feedforward(
                            self.dec,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            # Final linear projection
            self.logits = tf.layers.dense(self.dec, len(en2idx))
            self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) *
                self.istarget) / (tf.reduce_sum(self.istarget))
            tf.summary.scalar('acc', self.acc)

            if is_training:
                # Loss
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

                # Training Scheme
                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                # Summary
                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
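
The loss block above hinges on two details that are easy to miss: label smoothing spreads a little probability mass over the whole vocabulary before the cross-entropy is taken, and istarget masks out the <PAD> positions so they contribute to neither the loss nor the accuracy. Below is a minimal NumPy sketch of both steps, assuming the common smoothing factor of 0.1 (the actual value lives inside label_smoothing, which is not shown in this listing):

import numpy as np

def label_smoothing_np(one_hot, epsilon=0.1):
    # Spread epsilon of the probability mass uniformly over the vocabulary.
    vocab_size = one_hot.shape[-1]
    return (1.0 - epsilon) * one_hot + epsilon / vocab_size

# Toy targets: token id 0 is <PAD>.
y = np.array([[5, 2, 0, 0],
              [7, 3, 9, 0]])
vocab_size = 10
one_hot = np.eye(vocab_size)[y]              # (batch, time, vocab)
y_smoothed = label_smoothing_np(one_hot)     # what the cross-entropy is computed against

# Stand-in per-position cross-entropy values, shape (batch, time),
# averaged only over the non-<PAD> positions, exactly like mean_loss above.
per_position_loss = np.random.rand(*y.shape)
istarget = (y != 0).astype(np.float32)
mean_loss = np.sum(per_position_loss * istarget) / np.sum(istarget)
print(mean_loss)

With epsilon = 0.1 and a vocabulary of 10, the correct class ends up with 0.91 and every other class with 0.01, which is what keeps the model from becoming over-confident on the training targets.
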
Beispiel #31
0
def eval():
    # Load graph
    g = Graph(is_training=False)
    print("Graph loaded")

    # Load data
    X, Sources, Targets = load_test_data()
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()

    #     X, Sources, Targets = X[:33], Sources[:33], Targets[:33]

    # Start session
    with g.graph.as_default():
        sv = tf.train.Supervisor()
        gpu_options = tf.GPUOptions(allow_growth=True)
        with sv.managed_session(config=tf.ConfigProto(
                gpu_options=gpu_options)) as sess:
            ## Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print("Restored!")

            ## Get model name
            mname = open(hp.logdir + '/checkpoint',
                         'r').read().split('"')[1]  # model name

            ## Inference
            if not os.path.exists('results'): os.mkdir('results')
            with codecs.open("results/" + mname, "w", "utf-8") as fout:
                list_of_refs, hypotheses = [], []
                for i in range(len(X) // hp.batch_size):

                    ### Get mini-batches (slice out one batch)
                    x = X[i * hp.batch_size:(i + 1) * hp.batch_size]
                    sources = Sources[i * hp.batch_size:(i + 1) *
                                      hp.batch_size]
                    targets = Targets[i * hp.batch_size:(i + 1) *
                                      hp.batch_size]

                    ### Autoregressive inference
                    preds = np.zeros((hp.batch_size, hp.maxlen), np.int32)
                    for j in range(hp.maxlen):
                        # Run the network to get g.preds, feeding the all-zero preds
                        # array defined above as g.y; each pass predicts one more word
                        # for every sentence in the batch.
                        # The attention masks make this valid: when predicting the j-th
                        # word, self-attention cannot see later positions (sequence
                        # mask), and the decoder-encoder attention ignores the 0 <PAD>
                        # tokens (query mask), so decoding can proceed word by word.
                        _preds = sess.run(g.preds, {g.x: x, g.y: preds})
                        preds[:, j] = _preds[:, j]

                    ### Write to file
                    # zip picks out the source, target and prediction for each sentence in the batch
                    for source, target, pred in zip(sources, targets,
                                                    preds):  # sentence-wise
                        # " ".join获得整个句子,在</S>前的留下
                        got = " ".join(
                            idx2en[idx]
                            for idx in pred).split("</S>")[0].strip()
                        print(got)
                        fout.write("- source: " + source + "\n")
                        fout.write("- expected: " + target + "\n")
                        fout.write("- got: " + got + "\n\n")
                        fout.flush()

                        # bleu score
                        ref = target.split()
                        hypothesis = got.split()
                        # Skip sentences of 3 tokens or fewer: BLEU rates very short sentences unreasonably high.
                        if len(ref) > 3 and len(hypothesis) > 3:
                            list_of_refs.append([ref])
                            hypotheses.append(hypothesis)

                ## Calculate bleu score
                # list_of_refs has shape (number of sentences longer than 3 tokens, 1, sentence length);
                # the batch dimension is gone, since batch size is only a training-time parameter.
                score = corpus_bleu(list_of_refs, hypotheses)
                fout.write("Bleu Score = " + str(100 * score))