Example 1
    def test_convert_tokens_to_ids(self):
        vocab_tokens = [
            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
            "runn", "##ing"
        ]
        vocab = {token: i for i, token in enumerate(vocab_tokens)}
        self.assertAllEqual(
            tokenization.convert_tokens_to_ids(
                vocab, ["un", "##want", "##ed", "runn", "##ing"]),
            [7, 4, 5, 8, 9])
Example 2
  def test_convert_tokens_to_ids(self):
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
        "##ing"
    ]

    vocab = {}
    for (i, token) in enumerate(vocab_tokens):
      vocab[token] = i

    self.assertAllEqual(
        tokenization.convert_tokens_to_ids(
            vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9])
Example 3
    def test_convert_tokens_to_ids(self):
        vocab_tokens = [
            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
            "##ing"
        ]

        vocab = {}
        for (i, token) in enumerate(vocab_tokens):
            vocab[token] = i

        self.assertListEqual(
            tokenization.convert_tokens_to_ids(
                vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9])
Example 4
    def test_convert_tokens_to_ids(self):
        vocab_tokens = [
            '[UNK]', '[CLS]', '[SEP]', 'want', '##want', '##ed', 'wa', 'un',
            'runn', '##ing'
        ]

        vocab = {}
        for (i, token) in enumerate(vocab_tokens):
            vocab[token] = i

        self.assertAllEqual(
            tokenization.convert_tokens_to_ids(
                vocab, ['un', '##want', '##ed', 'runn', '##ing']),
            [7, 4, 5, 8, 9])
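All four examples above exercise the same contract: convert_tokens_to_ids takes a token-to-id vocab dict plus a list of tokens and returns the matching list of ids. Below is a minimal sketch of that behavior, assuming a plain dictionary lookup (the real tokenization module may handle out-of-vocabulary tokens differently):

def convert_tokens_to_ids(vocab, tokens):
    # Plain lookup: every token is assumed to be present in the vocab dict.
    return [vocab[token] for token in tokens]

vocab_tokens = [
    "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
    "runn", "##ing"
]
vocab = {token: i for i, token in enumerate(vocab_tokens)}
assert convert_tokens_to_ids(
    vocab, ["un", "##want", "##ed", "runn", "##ing"]) == [7, 4, 5, 8, 9]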
Example 5
def get_padded_tokens(
    tokens,
    tags,
    flags,
    bounds,
    extra_features,
    vocabs,
    max_seq_length,
):
    # Reserve room for the fixed query string appended below.
    max_seq_length += 64
    # extend() on a string appends it character by character; the Chinese query
    # roughly reads "which financial companies / platforms / centers / coins /
    # banks / funds / forex / groups / chains / shares / malls / shops / capital /
    # communities / fintech services / exchanges / wealth management / loans are there".
    tokens.extend('有哪些金融公司、平台、中心、币、银行、基金、外汇、集团、链、股份、商城、店、资本、家园、金服、交易所、理财、贷款')
    tokens.append('[SEP]')
    tokens = [
        token.lower() if token not in ['[CLS]', '[SEP]'] else token
        for token in tokens
    ]
    tokens = [token if token in vocabs else '[UNK]' for token in tokens]
    input_ids = tokenization.convert_tokens_to_ids(vocabs, tokens)
    input_mask = [1] * len(input_ids)
    # tag_ids = [BIO_TAG2ID[tag] for tag in tags]
    begin_tag_ids = [1 if tag == 'B' else 0 for tag in tags]
    end_tag_ids = [1 if tag == 'E' else 0 for tag in tags]
    pairs = get_span_from_tags(tags)

    # flag_ids = [POS_FLAGS_TO_IDS[flag] for flag in flags]
    # bound_ids = [WORD_BOUNDS_TO_IDS[bound] for bound in bounds]
    assert len(input_ids) <= max_seq_length, "len:{}".format(len(input_ids))

    # to_pad = [0] * (max_seq_length - len(input_ids))
    # fea_to_pad = [[0] * len(extra_features[0])] * (max_seq_length - len(input_ids))
    # First bring the tag sequences up to the (extended) token length,
    # then pad everything to max_seq_length (0 for ids/mask, -1 for tags).
    while len(begin_tag_ids) < len(tokens):
        begin_tag_ids.append(-1)
        end_tag_ids.append(-1)
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        # flag_ids.append(0)
        begin_tag_ids.append(-1)
        end_tag_ids.append(-1)
        # bound_ids.append(0)
        # extra_features.append([0]*len(extra_features[0]))

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(begin_tag_ids) == max_seq_length
    assert len(end_tag_ids) == max_seq_length
    return input_ids, input_mask, (
        begin_tag_ids, end_tag_ids, pairs)  # , flag_ids, bound_ids, extra_features
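As a toy illustration of the begin/end encoding built above (the tag inventory here is assumed, not taken from the original repository; 'B' marks a span start and 'E' a span end):

tags = ['O', 'B', 'I', 'E', 'O']
begin_tag_ids = [1 if tag == 'B' else 0 for tag in tags]
end_tag_ids = [1 if tag == 'E' else 0 for tag in tags]
assert begin_tag_ids == [0, 1, 0, 0, 0]
assert end_tag_ids == [0, 0, 0, 1, 0]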
Example 6
def get_padded_tokens(tokens,
                      tags,
                      flags,
                      bounds,
                      extra_features,
                      vocabs,
                      max_seq_length,
                      pad='after'):
    tokens = [
        token.lower() if token not in ['[CLS]', '[SEP]'] else token
        for token in tokens
    ]
    tokens = [token if token in vocabs else '[UNK]' for token in tokens]
    input_ids = tokenization.convert_tokens_to_ids(vocabs, tokens)
    input_mask = [1] * len(input_ids)
    tag_ids = [BIO_TAG2ID[tag] for tag in tags]
    flag_ids = [POS_FLAGS_TO_IDS[flag] for flag in flags]
    bound_ids = [WORD_BOUNDS_TO_IDS[bound] for bound in bounds]
    assert len(input_ids) <= max_seq_length, "len:{}".format(len(input_ids))

    to_pad = [0] * (max_seq_length - len(input_ids))
    fea_to_pad = ([[0] * len(extra_features[0])] *
                  (max_seq_length - len(input_ids)))
    if pad == 'before':
        input_ids = to_pad + input_ids
        input_mask = to_pad + input_mask
        tag_ids = to_pad + tag_ids
        flag_ids = to_pad + flag_ids
        bound_ids = to_pad + bound_ids
        extra_features = fea_to_pad + extra_features
    elif pad == 'after':
        input_ids = input_ids + to_pad
        input_mask = input_mask + to_pad
        tag_ids = tag_ids + to_pad
        flag_ids = flag_ids + to_pad
        bound_ids = bound_ids + to_pad
        extra_features = extra_features + fea_to_pad

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(tag_ids) == max_seq_length
    return input_ids, input_mask, tag_ids, flag_ids, bound_ids, extra_features
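Both padding branches above apply the same left/right padding pattern; a minimal standalone sketch of it (function name hypothetical):

def pad_ids(ids, max_seq_length, pad='after', pad_value=0):
    # Left- or right-pad a list of ids to a fixed length.
    to_pad = [pad_value] * (max_seq_length - len(ids))
    return to_pad + ids if pad == 'before' else ids + to_pad

assert pad_ids([7, 4, 5], 6) == [7, 4, 5, 0, 0, 0]
assert pad_ids([7, 4, 5], 6, pad='before') == [0, 0, 0, 7, 4, 5]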
Example 7
    def __init__(self):
        vocab_file = 'vocab.txt'
        vocab = tokenization.load_vocab(vocab_file=vocab_file)
        tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)
        path = 'train_processed.txt'

        with open(path, 'r', encoding='utf-8') as train_file:
            lines = train_file.read().split('\n')

        max_length = 0

        for i in range(len(lines)):
            TK = lines[i].split(' \t')

            if max_length < len(TK[0]):
                max_length = len(TK[0])

        max_length += 1

        self.input_ids = np.zeros(shape=[len(lines), max_length],
                                  dtype=np.int32)
        self.input_mask = np.zeros(shape=[len(lines), max_length],
                                   dtype=np.int32)
        self.label = np.zeros(shape=[len(lines)], dtype=np.int32)

        for i in range(len(lines) - 1):
            TK = lines[i].split(' \t')
            if len(TK) != 2:
                TK = lines[i].split('\t')

            sentence = TK[0]
            token = tokenizer.tokenize(sentence)
            tk_ids = tokenization.convert_tokens_to_ids(vocab=vocab,
                                                        tokens=token)

            for j in range(len(tk_ids)):
                self.input_ids[i, j + 1] = tk_ids[j]
                self.input_mask[i, j + 1] = 1
            self.input_ids[i, 0] = tokenization.convert_tokens_to_ids(
                vocab=vocab, tokens=['[CLS]'])[0]
            self.input_mask[i, 0] = 1
            self.label[i] = int(TK[1])

        path = 'test_processed.txt'

        with open(path, 'r', encoding='utf-8') as test_file:
            lines = test_file.read().split('\n')

        max_length = 0

        for i in range(len(lines)):
            TK = lines[i].split(' \t')

            if max_length < len(TK[0]):
                max_length = len(TK[0])

        print(max_length)
        max_length += 1

        self.test_input_ids = np.zeros(shape=[len(lines), max_length],
                                       dtype=np.int32)
        self.test_input_ids_masking = np.zeros(shape=[len(lines), max_length],
                                               dtype=np.int32)
        self.test_label = np.zeros(shape=[len(lines)], dtype=np.int32)

        for i in range(len(lines) - 1):
            TK = lines[i].split(' \t')
            if len(TK) != 2:
                TK = lines[i].split('\t')

            sentence = TK[0]
            token = tokenizer.tokenize(sentence)
            tk_ids = tokenization.convert_tokens_to_ids(vocab=vocab,
                                                        tokens=token)

            for j in range(len(tk_ids)):
                self.test_input_ids[i, j + 1] = tk_ids[j]
                self.test_input_ids_masking[i, j + 1] = 1
            self.test_input_ids[i, 0] = tokenization.convert_tokens_to_ids(
                vocab=vocab, tokens=['[CLS]'])[0]
            self.test_input_ids_masking[i, 0] = 1

            self.test_label[i] = int(TK[1])

        self.Batch_Size = 8

        self.random_idx = np.array(range(self.label.shape[0]), dtype=np.int32)
        np.random.shuffle(self.random_idx)

        self.Batch_Idx = 0
        self.Test_Batch_Idx = 0
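Each row built above holds the [CLS] id at position 0, the sentence's WordPiece ids from position 1, and zero ids with a zero mask after that. A small standalone sketch of one such row, using made-up ids rather than ids from vocab.txt:

import numpy as np

cls_id, tk_ids = 101, [2003, 2023, 2204]   # hypothetical ids, not from vocab.txt
max_length = 6
row_ids = np.zeros(max_length, dtype=np.int32)
row_mask = np.zeros(max_length, dtype=np.int32)
row_ids[0], row_mask[0] = cls_id, 1        # [CLS] occupies position 0
row_ids[1:1 + len(tk_ids)] = tk_ids        # sentence tokens start at position 1
row_mask[1:1 + len(tk_ids)] = 1
print(row_ids)    # [ 101 2003 2023 2204    0    0]
print(row_mask)   # [1 1 1 1 0 0]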
Example 8
    def convert_tokens_to_ids(self, tokens):
        return tokenization.convert_tokens_to_ids(self.vocab, tokens)
Example 9
import tensorflow as tf
import tokenization

vocab_pass = '******'

p_tokens = [
    '[Subject]', 'John', 'and', 'Michael', '[equalTo]', 'genius', '[when]',
    'morning'
]
predict_ids = tokenization.convert_tokens_to_ids(
    tokenization.load_vocab(vocab_pass), p_tokens)
r_tokens = [
    '[Subject]', 'John', 'and', 'Michael', '[equalTo]', 'smart', '[when]',
    'afternoon', 'and', 'evening'
]
real_ids = tokenization.convert_tokens_to_ids(
    tokenization.load_vocab(vocab_pass), r_tokens)
predict_tensor = tf.constant([[1, 2, 3], [98, 1, 6], [1, 2, 4], [22, 1, 6],
                              [3, 2, 3], [7, 1, 6], [0, 2, 3], [11, 1, 9]],
                             dtype=float)
real_tensor = tf.constant(
    [[1, 2, 3], [12, 8, 1], [1, 2, 4], [12, 8, 1], [3, 2, 3], [12, 8, 1],
     [0, 2, 3], [12, 8, 1], [1, 2, 4], [12, 8, 1]],
    dtype=float)


def is_special_id(id):
    vocab = tokenization.load_vocab(vocab_pass)
    inv_vocab = {v: k for k, v in vocab.items()}
    special_tokens = []
    for token in vocab.keys():