Example #1
0
 def test_get_batch_data(self):
     """Check the batch that ``generator.get_batch_data`` derives from data.

     The encoder fields must be passed through as the very same objects,
     the decoder input must be the data shifted down by one row, the label
     must equal the data, and the label weight must be the expected length
     mask scaled by the ``unmasked_token_weight`` given here (all twos).
     """
     data = np.array([
         [12, 10, 10, 0],
         [12, 8, 7, 0],
         [13, 4, 11, 0],
         [8, 13, 9, 0],
         [7, 5, 11, 0],
         [0, 12, 12, 0],
         [0, 10, 0, 0],
         [0, 0, 0, 0],
     ])
     seq_len = np.array([6, 8, 7, 0])
     batch = dstruct.BatchTuple(
         dstruct.Seq2SeqFeatureTuple(data, seq_len, None, None),
         dstruct.SeqLabelTuple(None, None, None),
         None,
         False)
     doubled_weight = np.ones_like(data) * 2

     new_batch = generator.get_batch_data(batch,
                                          data,
                                          start_id=1,
                                          seq_len_idx=1,
                                          input_key='dec_inputs',
                                          seq_len_key='dec_seq_len',
                                          unmasked_token_weight=doubled_weight)
     out_features, out_labels, num_tokens, _ = new_batch

     # Encoder side is untouched: identity, not just equality.
     self.assertIs(data, out_features.enc_inputs,
                   'enc data is the same object.')
     self.assertIs(seq_len, out_features.enc_seq_len,
                   'enc seq len is the same object.')

     # Decoder side is derived from the data.
     np.testing.assert_array_equal(data[:-1, :],
                                   out_features.dec_inputs[1:, :],
                                   err_msg='dec input is shifted data.')
     np.testing.assert_array_equal(data,
                                   out_labels.label,
                                   err_msg='dec output is data.')
     np.testing.assert_array_equal(seq_len,
                                   out_features.dec_seq_len,
                                   err_msg='dec seq len is correct.')
     self.assertEqual(sum(seq_len), num_tokens, 'num tokens is correct.')

     # Expected mask: ones for the first ``seq_len`` rows of each column.
     length_mask = np.array([
         [1, 1, 1, 0],
         [1, 1, 1, 0],
         [1, 1, 1, 0],
         [1, 1, 1, 0],
         [1, 1, 1, 0],
         [1, 1, 1, 0],
         [0, 1, 1, 0],
         [0, 1, 0, 0],
     ])
     np.testing.assert_array_equal(length_mask * 2,
                                   new_batch.labels.label_weight,
                                   err_msg='token weight')
Example #2
0
def get_batch_data(batch,
                   y_arr,
                   unmasked_token_weight=None,
                   unmasked_seq_weight=None,
                   start_id=1,
                   seq_len_idx=1,
                   input_key='inputs',
                   seq_len_key='seq_len'):
    """Build a new batch whose label is ``y_arr`` and whose input is ``y_arr``
    shifted down by one row behind a start-token row.

    Args:
        batch: source batch; ``batch.features`` must support ``_replace`` and
            integer indexing, and ``batch.keep_state`` is carried over.
        y_arr: 2-D array of token ids, one sequence per column. The length
            of a column is taken as the index of its first minimum value
            plus one.
        unmasked_token_weight: optional multiplier applied in place to the
            token weights returned by ``util.masked_full_like``.
        unmasked_seq_weight: optional multiplier applied to the 0/1
            per-sequence weights.
        start_id: token id written in the first input row of every
            non-empty sequence.
        seq_len_idx: index into ``batch.features`` of the length array;
            columns whose entry there is ``<= 0`` get length 0.
        input_key: name of the feature field that receives the shifted input.
        seq_len_key: name of the feature field that receives the lengths.

    Returns:
        A ``ds.BatchTuple`` of the updated features, a ``ds.SeqLabelTuple``
        ``(y_arr, token_weight, seq_weight)``, the token count reported by
        ``util.masked_full_like`` and the original ``keep_state``.
    """
    y_len = np.argmin(y_arr, axis=0) + 1
    # Sequences that are empty in the source batch stay empty.
    y_len[batch.features[seq_len_idx] <= 0] = 0
    has_tokens = y_len > 0
    seq_weight = np.where(has_tokens, 1, 0).astype(np.float32)
    if unmasked_seq_weight is not None:
        seq_weight *= unmasked_seq_weight
    token_weight, num_tokens = util.masked_full_like(y_arr, 1, y_len)
    if unmasked_token_weight is not None:
        token_weight *= unmasked_token_weight
    # Mask the start id with the 0/1 non-empty indicator, NOT with
    # ``seq_weight``: once ``seq_weight`` has been scaled by
    # ``unmasked_seq_weight`` the product would be truncated by the int32
    # cast (e.g. 1 * 0.5 -> 0) or inflated (1 * 2 -> 2), corrupting the id.
    start = np.where(has_tokens, start_id, 0).astype(np.int32)[np.newaxis, :]
    # Prepend the start row and drop the last row so the shape matches y_arr.
    x_arr = np.vstack((start, y_arr))[:-1, :]
    features = batch.features._replace(**{
        input_key: x_arr,
        seq_len_key: y_len
    })
    labels = ds.SeqLabelTuple(y_arr, token_weight, seq_weight)
    return ds.BatchTuple(features, labels, num_tokens, batch.keep_state)
Example #3
0
 def test_reward_match_label(self):
     """Check ``generator.reward_match_label`` in exact and partial mode.

     The sample differs from the label data at a single token and carries
     one extra row, so the exact-match reward is expected at one position
     only, while the partial-match reward is expected per matching token,
     divided by the sample length of its column.
     """
     data = np.array([
         [12, 10, 10, 0],
         [12, 8, 7, 0],
         [13, 4, 11, 0],
         [8, 13, 9, 0],
         [7, 5, 11, 0],
         [0, 12, 12, 0],
         [0, 10, 0, 0],
         [0, 0, 0, 0],
     ])
     batch = dstruct.BatchTuple(
         dstruct.Seq2SeqFeatureTuple(None, None, None, None),
         dstruct.SeqLabelTuple(data, None, None),
         None,
         False)
     sample = np.array([
         [12, 10, 10, 0],
         [12, 8, 7, 0],
         [11, 4, 11, 0],
         [8, 13, 9, 0],
         [7, 5, 11, 0],
         [0, 12, 12, 0],
         [0, 1, 0, 0],
         [0, 1, 0, 0],
         [0, 0, 0, 0],
     ])
     # Exact match: a single reward at row 6 of the third column.
     exact_match = np.zeros_like(sample)
     exact_match[6, 2] = 1
     parti_match = np.array([
         [1, 1, 1, 0],
         [1, 1, 1, 0],
         [0, 1, 1, 0],
         [1, 1, 1, 0],
         [1, 1, 1, 0],
         [1, 1, 1, 0],
         [0, 0, 1, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
     ])
     sample_len = np.array([6., 9., 7., 0.])
     # Same lengths with the empty column set to 1 to avoid dividing by zero.
     _sample_len = np.array([6., 9., 7., 1.])
     expected_partial = parti_match / _sample_len

     reward, avg = generator.reward_match_label(sample,
                                                batch,
                                                partial_match=False)
     np.testing.assert_array_equal(reward, exact_match,
                                   'label exact match reward')
     self.assertEqual(avg, 1 / 3, 'average correct')

     reward, avg = generator.reward_match_label(sample,
                                                batch,
                                                partial_match=True)
     np.testing.assert_array_equal(reward, expected_partial,
                                   'label match reward')
     num_non_empty = np.sum(sample_len > 0)
     self.assertEqual(avg,
                      np.sum(expected_partial) / num_non_empty,
                      'average correct')
Example #4
0
def lseq2seq_batch_iter(enc_data,
                        dec_data,
                        label_data,
                        mask_data,
                        batch_size=1,
                        shuffle=True):
    """Yield seq2seq batches that also carry a label and a mask per example.

    Each group produced by ``batch_iter`` is turned into a ``ds.BatchTuple``
    holding ``ds.LSeq2SeqFeatureTuple`` features and ``ds.SeqLabelTuple``
    labels. The decoder input is the decoder data without its last row and
    the decoder output is the same data without its first row.
    """
    batches = batch_iter(batch_size,
                         shuffle,
                         enc_data,
                         dec_data,
                         label_data,
                         mask_data,
                         pad=[[], [], 0, 2])
    for enc_seqs, dec_seqs, raw_labels, raw_masks in batches:
        enc, enc_len = util.hstack_list(enc_seqs)
        dec, dec_len = util.hstack_list(dec_seqs)
        label = np.array(raw_labels, dtype=np.int32)
        mask = np.array(raw_masks, dtype=np.int32)

        # 1 for every non-empty decoder sequence, 0 otherwise.
        non_empty = np.where(dec_len > 0, 1, 0)
        # Input/output are one step shorter than the full decoder sequence.
        dec_len -= non_empty

        dec_in, dec_out = dec[:-1, :], dec[1:, :]
        token_weight, num_tokens = util.masked_full_like(
            dec_out, 1, num_non_padding=dec_len)

        features = ds.LSeq2SeqFeatureTuple(enc, enc_len, dec_in, dec_len,
                                           label, mask)
        labels = ds.SeqLabelTuple(dec_out, token_weight,
                                  non_empty.astype(np.float32))
        yield ds.BatchTuple(features, labels, num_tokens, False)
Example #5
0
def seq_batch_iter(in_data,
                   out_data,
                   weights,
                   batch_size=1,
                   shuffle=True,
                   keep_sentence=True):
    """Wrap ``batch_iter`` to yield formatted sequence batches.

    Args:
        in_data: input sequences, forwarded to ``batch_iter``.
        out_data: output sequences, forwarded to ``batch_iter``.
        weights: optional weights batched alongside the data (padded with
            0). ``None`` or an empty collection selects unit weights.
            Presumably one weight per example -- confirm against
            ``batch_iter``.
        batch_size: forwarded to ``batch_iter``.
        shuffle: forwarded to ``batch_iter``.
        keep_sentence: the yielded batches carry ``keep_state`` set to
            ``not keep_sentence``.

    Yields:
        ``ds.BatchTuple`` of ``ds.SeqFeatureTuple`` features,
        ``ds.SeqLabelTuple`` labels, the token count reported by
        ``util.masked_full_like`` and ``keep_state``.
    """
    keep_state = not keep_sentence
    # Explicit check instead of ``if weights:``, which raises ValueError when
    # ``weights`` is a NumPy array with more than one element.
    has_weights = weights is not None and len(weights) > 0
    if has_weights:
        batches = batch_iter(batch_size,
                             shuffle,
                             in_data,
                             out_data,
                             weights,
                             pad=[[], [], 0])
    else:
        unweighted = batch_iter(batch_size,
                                shuffle,
                                in_data,
                                out_data,
                                pad=[[], []])
        # A constant weight of 1 lets both cases share the loop below.
        batches = ((x, y, 1) for x, y in unweighted)
    for x, y, w in batches:
        x_arr, x_len = util.hstack_list(x)
        y_arr, y_len = util.hstack_list(y)
        # The sequence weight is the supplied weight, zeroed for empty
        # output sequences.
        seq_weight = np.where(y_len > 0, w, 0).astype(np.float32)
        token_weight, num_tokens = util.masked_full_like(
            y_arr, w, num_non_padding=y_len)
        features = ds.SeqFeatureTuple(x_arr, x_len)
        labels = ds.SeqLabelTuple(y_arr, token_weight, seq_weight)
        yield ds.BatchTuple(features, labels, num_tokens, keep_state)
Example #6
0
def concat_seq_batch(batch1, batch2):
    """Concatenate two sequence batches along the batch dimension.

    Inputs, labels and label weights are joined with
    ``util.hstack_with_padding``; sequence lengths are concatenated. The
    sequence weights contributed by ``batch2`` are all zero in the result.

    Args:
        batch1: first batch; its ``keep_state`` is used for the result.
        batch2: second batch. It is not modified.

    Returns:
        A ``ds.BatchTuple`` whose token count is the sum of both batches'.
    """
    _f1, _l1, _n1, _k1 = batch1
    _f2, _l2, _n2, _k2 = batch2
    inputs = util.hstack_with_padding(_f1.inputs, _f2.inputs)
    seq_len = np.concatenate((_f1.seq_len, _f2.seq_len))
    features = ds.SeqFeatureTuple(inputs, seq_len)
    label = util.hstack_with_padding(_l1.label, _l2.label)
    label_weight = util.hstack_with_padding(_l1.label_weight, _l2.label_weight)
    # Use a fresh zero array rather than zeroing ``_l2.seq_weight`` in place,
    # so the caller's ``batch2`` is left untouched.
    seq_weight = np.concatenate(
        (_l1.seq_weight, np.zeros_like(_l2.seq_weight)))
    labels = ds.SeqLabelTuple(label, label_weight, seq_weight)
    return ds.BatchTuple(features, labels, _n1 + _n2, _k1)
Example #7
0
def _format_word2def(x, w, c, y, sw):
    """Assemble one word-to-definition batch from raw per-example lists.

    ``x`` and ``y`` are stacked with ``util.hstack_list`` into the encoder
    and decoder arrays, ``c`` is stacked with ``util.vstack_list`` into the
    character array, ``w`` becomes an int32 array and ``sw`` a float32 array
    of sequence weights. The decoder input is the decoder array without its
    last row, the output is the same array without its first row.
    """
    enc, enc_len = util.hstack_list(x)
    dec, dec_len = util.hstack_list(y)
    char, char_len = util.vstack_list(c)
    word = np.array(w, dtype=np.int32)
    seq_weight = np.array(sw, dtype=np.float32)

    # Input/output are one step shorter than every non-empty decoder sequence.
    dec_len -= np.where(dec_len > 0, 1, 0)
    dec_in, dec_out = dec[:-1, :], dec[1:, :]
    token_weight, num_tokens = util.masked_full_like(
        dec_out, 1, num_non_padding=dec_len)

    features = ds.Word2DefFeatureTuple(enc, enc_len, word, char, char_len,
                                       dec_in, dec_len)
    labels = ds.SeqLabelTuple(dec_out, token_weight, seq_weight)
    return ds.BatchTuple(features, labels, num_tokens, False)
Example #8
0
def concat_word2def_batch(batch1, batch2):
    """Concatenate two word-to-definition batches along the batch dimension.

    Padded fields (encoder/decoder inputs, chars, labels, label weights) are
    joined with the ``util.*_with_padding`` helpers; per-example fields are
    joined with ``np.concatenate``. The result always has ``keep_state``
    set to ``False`` and a token count equal to the sum of both batches'.
    """
    feat1, lab1, ntok1, _ = batch1
    feat2, lab2, ntok2, _ = batch2

    features = ds.Word2DefFeatureTuple(
        util.hstack_with_padding(feat1.enc_inputs, feat2.enc_inputs),
        np.concatenate((feat1.enc_seq_len, feat2.enc_seq_len)),
        np.concatenate((feat1.words, feat2.words)),
        util.vstack_with_padding(feat1.chars, feat2.chars),
        np.concatenate((feat1.char_len, feat2.char_len)),
        util.hstack_with_padding(feat1.dec_inputs, feat2.dec_inputs),
        np.concatenate((feat1.dec_seq_len, feat2.dec_seq_len)))

    labels = ds.SeqLabelTuple(
        util.hstack_with_padding(lab1.label, lab2.label),
        util.hstack_with_padding(lab1.label_weight, lab2.label_weight),
        np.concatenate((lab1.seq_weight, lab2.seq_weight)))

    return ds.BatchTuple(features, labels, ntok1 + ntok2, False)
Example #9
0
def seq2seq_batch_iter(enc_data, dec_data, batch_size=1, shuffle=True):
    """Yield formatted seq2seq batches built on top of ``batch_iter``.

    Each batch is a ``ds.BatchTuple`` with ``ds.Seq2SeqFeatureTuple``
    features and ``ds.SeqLabelTuple`` labels. The decoder input is the
    decoder data without its last row and the decoder output is the same
    data without its first row.
    """
    batches = batch_iter(batch_size,
                         shuffle,
                         enc_data,
                         dec_data,
                         pad=[[], []])
    for enc_seqs, dec_seqs in batches:
        enc, enc_len = util.hstack_list(enc_seqs)
        dec, dec_len = util.hstack_list(dec_seqs)

        # 1 for every non-empty decoder sequence, 0 otherwise.
        non_empty = np.where(dec_len > 0, 1, 0)
        # Input/output are one step shorter than the full decoder sequence.
        dec_len -= non_empty

        dec_in, dec_out = dec[:-1, :], dec[1:, :]
        token_weight, num_tokens = util.masked_full_like(
            dec_out, 1, num_non_padding=dec_len)

        features = ds.Seq2SeqFeatureTuple(enc, enc_len, dec_in, dec_len)
        labels = ds.SeqLabelTuple(dec_out, token_weight,
                                  non_empty.astype(np.float32))
        yield ds.BatchTuple(features, labels, num_tokens, False)
Example #10
0
def seq_batch_iter(in_data,
                   out_data,
                   batch_size=1,
                   shuffle=True,
                   keep_sentence=True):
    """Yield formatted sequence batches built on top of ``batch_iter``.

    Each batch is a ``ds.BatchTuple`` with ``ds.SeqFeatureTuple`` features
    and ``ds.SeqLabelTuple`` labels; its ``keep_state`` flag is
    ``not keep_sentence``.
    """
    batches = batch_iter(batch_size,
                         shuffle,
                         in_data,
                         out_data,
                         pad=[[], []])
    for in_seqs, out_seqs in batches:
        inputs, input_len = util.hstack_list(in_seqs)
        targets, target_len = util.hstack_list(out_seqs)

        # Weight 1 for every non-empty output sequence, 0 otherwise.
        seq_weight = np.where(target_len > 0, 1, 0).astype(np.float32)
        token_weight, num_tokens = util.masked_full_like(
            targets, 1, num_non_padding=target_len)

        yield ds.BatchTuple(
            ds.SeqFeatureTuple(inputs, input_len),
            ds.SeqLabelTuple(targets, token_weight, seq_weight),
            num_tokens,
            not keep_sentence)
Example #11
0
 def fill_data(pos):
     """Wrap ``pos`` in a ``ds.BatchTuple``.

     ``labels``, ``batch_size`` and ``keep_state`` are free variables
     resolved outside this function.
     """
     fields = (pos, labels, batch_size, keep_state)
     return ds.BatchTuple(*fields)
Example #12
0
 def fill_data(pos):
     """Wrap ``pos`` in a ``ds.BatchTuple`` with its summed token count.

     The count is the sum of the ``num_tokens`` entries selected by
     ``pos``. ``labels``, ``num_tokens`` and ``keep_state`` are free
     variables resolved outside this function.
     """
     selected = list(pos)
     token_count = np.sum(num_tokens[selected])
     return ds.BatchTuple(pos, labels, token_count, keep_state)