Example 1
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from bert.tokenization import FullTokenizer


class BERTVectorizer:

    def __init__(self, sess, bert_model_hub_path):
        self.sess = sess
        self.bert_model_hub_path = bert_model_hub_path
        self.create_tokenizer_from_hub_module()

    def create_tokenizer_from_hub_module(self):
        # get the vocab file and casing information directly from the BERT TF Hub module
        bert_module = hub.Module(self.bert_model_hub_path)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        vocab_file, do_lower_case = self.sess.run(
            [
                tokenization_info["vocab_file"],
                tokenization_info["do_lower_case"]
            ]
        )
        self.tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case) #do_lower_case=True
        # print(tokenizer.tokenize('hello world!'))  --> ['hello', 'world', '!']

    def tokenize(self, text:str): ## tokenize every sentence
        words = text.split()
        ## # text: add leah kauffman to my uncharted 4 nathan drake playlist
        ## # words: ['add', 'leah', 'kauffman', 'to', 'my', 'uncharted', '4', 'nathan', 'drake', 'playlist']
        tokens = []
        ## # tokens: ['add', 'leah', 'ka', '##uf', '##fm', '##an', 'to', 'my', 'un', '##cha', '##rted', '4', 'nathan', 'drake', 'play', '##list']
        valid_positions = []
        ## # valid_positions:[1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0]
        for word in words:
            token = self.tokenizer.tokenize(word)
            tokens.extend(token)
            for i in range(len(token)):
                if i == 0:
                    valid_positions.append(1)
                else:
                    valid_positions.append(0)
        return tokens, valid_positions

    def transform(self, text_arr):
        input_ids = []
        input_mask = []
        segment_ids = []
        valid_positions = []
        for text in text_arr:
            ids, mask, seg_ids, valid_pos = self.__vectorize(text)
            input_ids.append(ids)
            input_mask.append(mask)
            segment_ids.append(seg_ids)
            valid_positions.append(valid_pos)

        sequence_length = np.array([len(i) for i in input_ids])

        ## pad/truncate every sequence to a maximum length of 50
        input_ids = tf.keras.preprocessing.sequence.pad_sequences(input_ids, maxlen=50, truncating='post', padding='post')
        input_mask = tf.keras.preprocessing.sequence.pad_sequences(input_mask, maxlen=50, truncating='post', padding='post')
        segment_ids = tf.keras.preprocessing.sequence.pad_sequences(segment_ids, maxlen=50, truncating='post', padding='post')
        valid_positions = tf.keras.preprocessing.sequence.pad_sequences(valid_positions, maxlen=50, truncating='post', padding='post')

        ## alternative: pad to the longest sequence in the batch instead of a fixed length
        # input_ids = tf.keras.preprocessing.sequence.pad_sequences(input_ids, padding='post')
        # input_mask = tf.keras.preprocessing.sequence.pad_sequences(input_mask, padding='post')
        # segment_ids = tf.keras.preprocessing.sequence.pad_sequences(segment_ids, padding='post')
        # valid_positions = tf.keras.preprocessing.sequence.pad_sequences(valid_positions, padding='post')

        return input_ids, input_mask, segment_ids, valid_positions, sequence_length

    def __vectorize(self, text:str):
        tokens, valid_positions = self.tokenize(text)

        ## insert the first token "[CLS]"
        tokens.insert(0, '[CLS]')
        valid_positions.insert(0, 1)
        ## insert the last token "[SEP]"
        tokens.append('[SEP]')
        valid_positions.append(1)
        ## ['[CLS]', 'add', 'leah', 'ka', '##uf', '##fm', '##an', 'to', 'my', 'un', '##cha', '##rted', '4', 'nathan', 'drake', 'play', '##list', '[SEP]']
        ## [1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1]

        '''
        (a) For sequence pairs:
          tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
          type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        (b) For single sequences:
          tokens:   [CLS] the dog is hairy . [SEP]
          type_ids: 0     0   0   0  0     0 0

        "type_ids" are used to indicate whether this is the first sequence or the
        second sequence. The embedding vectors for `type=0` and `type=1` were
        learned during pre-training and are added to the wordpiece embedding
        vector (and position vector). This is not *strictly* necessary since the
        [SEP] token unambiguously separates the sequences, but it makes it easier
        for the model to learn the concept of sequences.
        '''
        segment_ids = [0] * len(tokens)
        ## # segment_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        ## # input_ids: [101, 5587, 14188, 10556, 16093, 16715, 2319, 2000, 2026, 4895, 7507, 17724, 1018, 7150, 7867, 2377, 9863, 102]; the first id is always 101 ([CLS]) and the last is always 102 ([SEP])

        input_mask = [1] * len(input_ids) ## The mask has 1 for real tokens and 0 for padding tokens.
        ## # input_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

        return input_ids, input_mask, segment_ids, valid_positions
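
A minimal usage sketch for the class above (TF1-style session execution; the hub URL and the sample sentence are illustrative assumptions, not part of the original snippet):

sess = tf.Session()
vectorizer = BERTVectorizer(sess, 'https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1')
input_ids, input_mask, segment_ids, valid_positions, sequence_length = vectorizer.transform(
    ['add leah kauffman to my uncharted 4 nathan drake playlist'])
print(input_ids.shape)  # (1, 50) after padding/truncating to maxlen=50
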
Example 2
from sequence_transfer.sequence import CharSequence, TokenSequence
from sequence_transfer.magic_transfer import MagicTransfer
from bert.tokenization import FullTokenizer


text = "She lives in Lindström, Minnesota"
tokenizer = FullTokenizer('../vocab.txt')
tokens = tokenizer.tokenize(text)

# 01 - We create sequences
s1 = CharSequence.new(text)
s2 = TokenSequence.new(tokens)

# 02 - We create a magic transfer that will try to match "similar" subsequences between s1 and s2:
transfer = MagicTransfer(s1, s2)

# 03 - We will use the transfer object to find the tokens that correspond to the word `Lindström`
sub1 = s1[13, 22]  # `Lindström` in s1
sub2 = transfer.apply(sub1)
for token in sub2:
    print(token.text)

# 04 - We can debug the transfer
transfer.debug()
Example 3
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub


class BERTVectorizer:
    def __init__(
        self,
        sess,
        is_bert,
        #                 bert_model_hub_path='https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1'
        bert_model_hub_path="https://tfhub.dev/google/albert_base/1"):
        self.sess = sess
        self.is_bert = is_bert
        self.bert_model_hub_path = bert_model_hub_path
        self.create_tokenizer_from_hub_module(is_bert=is_bert)

    def create_tokenizer_from_hub_module(self, is_bert):
        """Get the vocab file and casing info from the Hub module."""
        bert_module = hub.Module(self.bert_model_hub_path)
        tokenization_info = bert_module(signature="tokenization_info",
                                        as_dict=True)
        vocab_file, do_lower_case = self.sess.run([
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"],
        ])

        if is_bert:
            from bert.tokenization import FullTokenizer
            self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            from opennlu.services.tensorflow_JointBERT.vectorizers.albert_tokenization import FullTokenizer
            self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case,
                                           spm_model_file=vocab_file)

    def tokenize(self, text: str):
        words = text.split()  # whitespace tokenizer
        tokens = []
        valid_positions = []
        for word in words:
            token = self.tokenizer.tokenize(word)
            tokens.extend(token)
            for i in range(len(token)):
                if i == 0:
                    valid_positions.append(1)
                else:
                    valid_positions.append(0)
        return tokens, valid_positions

    def transform(self, text_arr):
        input_ids = []
        input_mask = []
        segment_ids = []
        valid_positions = []
        for text in text_arr:
            ids, mask, seg_ids, valid_pos = self.__vectorize(text)
            input_ids.append(ids)
            input_mask.append(mask)
            segment_ids.append(seg_ids)
            valid_positions.append(valid_pos)

        sequence_lengths = np.array([len(i) for i in input_ids])
        input_ids = tf.keras.preprocessing.sequence.pad_sequences(
            input_ids, padding='post')
        input_mask = tf.keras.preprocessing.sequence.pad_sequences(
            input_mask, padding='post')
        segment_ids = tf.keras.preprocessing.sequence.pad_sequences(
            segment_ids, padding='post')
        valid_positions = tf.keras.preprocessing.sequence.pad_sequences(
            valid_positions, padding='post')
        return input_ids, input_mask, segment_ids, valid_positions, sequence_lengths

    def __vectorize(self, text: str):
        tokens, valid_positions = self.tokenize(text)
        # insert "[CLS]"
        tokens.insert(0, '[CLS]')
        valid_positions.insert(0, 1)
        # insert "[SEP]"
        tokens.append('[SEP]')
        valid_positions.append(1)

        segment_ids = [0] * len(tokens)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        input_mask = [1] * len(input_ids)

        return input_ids, input_mask, segment_ids, valid_positions
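
A minimal usage sketch for this BERT/ALBERT variant (TF1-style session; the sample sentence is an illustrative assumption). With is_bert=False the default ALBERT hub module is used; with is_bert=True a BERT hub URL such as the commented one above should be passed instead:

sess = tf.Session()
albert_vectorizer = BERTVectorizer(sess, is_bert=False)
input_ids, input_mask, segment_ids, valid_positions, sequence_lengths = albert_vectorizer.transform(
    ['add leah kauffman to my uncharted 4 nathan drake playlist'])
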
Example 4
import numpy as np
import tensorflow as tf
from bert.tokenization import FullTokenizer, convert_to_unicode

# BertConfig, make_bert_graph, SentenceEmbeddingEvaluator and
# visualize_self_attention_scores are project-specific helpers assumed to be
# importable from the surrounding codebase.


class BERTEmbeddingEvaluator(SentenceEmbeddingEvaluator):
    def __init__(
            self,
            model_fname="/notebooks/embedding/data/sentence-embeddings/bert/tune-ckpt",
            bertconfig_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/bert_config.json",
            vocab_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/vocab.txt",
            max_seq_length=32,
            dimension=768,
            num_labels=2):

        super().__init__("bert", dimension)
        config = BertConfig.from_json_file(bertconfig_fname)
        self.max_seq_length = max_seq_length
        self.tokenizer = FullTokenizer(vocab_file=vocab_fname,
                                       do_lower_case=False)
        self.model, self.input_ids, self.input_mask, self.segment_ids, self.probs = make_bert_graph(
            config, max_seq_length, 1.0, num_labels, tune=False)
        saver = tf.train.Saver(tf.global_variables())
        self.sess = tf.Session()
        checkpoint_path = tf.train.latest_checkpoint(model_fname)
        saver.restore(self.sess, checkpoint_path)

    def predict(self, sentence):
        tokens = self.tokenize(sentence)
        model_input = self.make_input(tokens)
        probs = self.sess.run(self.probs, model_input)
        return probs

    """
    sentence를 입력하면 토크나이즈 결과와 token 벡터 시퀀스를 반환한다
        - shape :[[# of tokens], [batch size, max seq length, dimension]]
    """

    def get_token_vector_sequence(self, sentence):
        tokens = self.tokenize(sentence)
        model_input = self.make_input(tokens)
        return [
            tokens,
            self.sess.run(self.model.get_sequence_output()[0],
                          model_input)[:len(tokens) + 2]
        ]

    """
    sentence를 입력하면 토크나이즈 결과와 [CLS] 벡터를 반환한다
         - shape :[[# of tokens], [batch size, dimension]]
    """

    def get_sentence_vector(self, sentence):
        tokens = self.tokenize(sentence)
        model_input = self.make_input(tokens)
        return [
            tokens,
            self.sess.run(self.model.pooled_output, model_input)[0]
        ]

    """
    sentence를 입력하면 토크나이즈 결과와 self-attention score matrix를 반환한다
        - shape :[[# of tokens], [batch size, # of tokens, # of tokens]]
    """

    def get_self_attention_score(self, sentence):
        tokens = self.tokenize(sentence)
        model_input = self.make_input(tokens)
        # raw_score : shape=[# of layers, batch_size, num_attention_heads, max_seq_length, max_seq_length]
        raw_score = self.sess.run(self.model.attn_probs_for_visualization_list,
                                  model_input)
        # take the last layer, then sum over the attention heads (axis=0)
        scores = np.sum(raw_score[-1][0], axis=0)
        # keep only as many rows/columns of the scores matrix as there are tokens
        scores = scores[:len(tokens), :len(tokens)]
        return [tokens, scores]

    def tokenize(self, sentence):
        return self.tokenizer.tokenize(convert_to_unicode(sentence))

    def make_input(self, tokens):
        tokens = tokens[:(self.max_seq_length - 2)]
        token_sequence = ["[CLS]"] + tokens + ["[SEP]"]
        segment = [0] * len(token_sequence)
        sequence = self.tokenizer.convert_tokens_to_ids(token_sequence)
        current_length = len(sequence)
        padding_length = self.max_seq_length - current_length
        input_feed = {
            self.input_ids:
            np.array([sequence + [0] * padding_length]),
            self.segment_ids:
            np.array([segment + [0] * padding_length]),
            self.input_mask:
            np.array([[1] * current_length + [0] * padding_length])
        }
        return input_feed

    def visualize_self_attention_scores(self, sentence, palette="Viridis256"):
        tokens, scores = self.get_self_attention_score(sentence)
        visualize_self_attention_scores(tokens, scores, palette)
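
A minimal usage sketch for the evaluator above (it assumes the default checkpoint, config and vocab paths from the constructor exist; the input sentence is illustrative):

evaluator = BERTEmbeddingEvaluator()
probs = evaluator.predict("She lives in Lindström, Minnesota")
tokens, token_vectors = evaluator.get_token_vector_sequence("She lives in Lindström, Minnesota")
tokens, attention_scores = evaluator.get_self_attention_score("She lives in Lindström, Minnesota")
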
Example 5
import tensorflow as tf
from bert.tokenization import FullTokenizer

# VOCAB_FILE, DO_LOWER_CASE and mapping() are assumed to be defined elsewhere
# in the original module.


def build_dataset(conll_file,
                  tfrecord_file,
                  pos2id,
                  dep2id,
                  path2id,
                  truncate=False):
    max_len = 0

    tokenizer = FullTokenizer(vocab_file=VOCAB_FILE,
                              do_lower_case=DO_LOWER_CASE)

    with open(conll_file, 'r') as reader:
        text = reader.read().strip()
    sentences = text.split('\n\n')

    tf_writer = tf.python_io.TFRecordWriter(tfrecord_file)
    for sent in sentences:
        subword_list = ["[CLS]"]
        span_list = [0]
        mask_list = [0]
        cue_list = [0]

        pos_list = [0]
        dep_list = [0]
        path_list = [0]
        lpath_list = [-1]
        cp_list = [-1]

        subword_id_list = tokenizer.convert_tokens_to_ids(["[CLS]"])

        for token in sent.split('\n'):
            if len(token) >= 8:
                token = token.split('\t')

                token_ = token[0]
                subword = tokenizer.tokenize(token_)

                span = [int(token[8]) for _ in range(len(subword))]
                cue = [int(token[7]) for _ in range(len(subword))]

                pos = [
                    int(mapping(pos2id, token[2])) for _ in range(len(subword))
                ]
                dep = [
                    int(mapping(dep2id, token[3])) for _ in range(len(subword))
                ]
                path = [
                    int(mapping(path2id, token[4]))
                    for _ in range(len(subword))
                ]
                lpath = [int(token[5]) for _ in range(len(subword))]
                cp = [int(token[6]) for _ in range(len(subword))]

                mask = [0 for _ in range(len(subword))]
                mask[0] = 1

                sub_id = tokenizer.convert_tokens_to_ids(subword)

                subword_list.extend(subword)
                mask_list.extend(mask)
                subword_id_list.extend(sub_id)

                pos_list.extend(pos)
                dep_list.extend(dep)
                path_list.extend(path)
                lpath_list.extend(lpath)
                cp_list.extend(cp)

                cue_list.extend(cue)
                span_list.extend(span)

        subword_list.append("[SEP]")
        span_list.append(0)
        cue_list.append(0)
        mask_list.append(0)
        subword_id_list.extend(tokenizer.convert_tokens_to_ids(["[SEP]"]))

        pos_list.append(0)
        dep_list.append(0)
        path_list.append(0)
        lpath_list.append(-1)
        cp_list.append(-1)

        assert len(subword_list) == len(span_list) == len(mask_list) == len(
            subword_id_list)

        max_len = max(max_len, len(subword_id_list))

        if len(subword_list) > 2:
            if (not truncate) or (len(subword_id_list) <= 64):
                # write tfrecord
                token_id = [
                    tf.train.Feature(int64_list=tf.train.Int64List(value=[t_]))
                    for t_ in subword_id_list
                ]
                mask = [
                    tf.train.Feature(int64_list=tf.train.Int64List(value=[m_]))
                    for m_ in mask_list
                ]
                span = [
                    tf.train.Feature(int64_list=tf.train.Int64List(value=[s_]))
                    for s_ in span_list
                ]
                cue = [
                    tf.train.Feature(int64_list=tf.train.Int64List(value=[c_]))
                    for c_ in cue_list
                ]

                pos_features = [
                    tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[pos_])) for pos_ in pos_list
                ]
                dep_features = [
                    tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[dep_])) for dep_ in dep_list
                ]
                path_features = [
                    tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[path_])) for path_ in path_list
                ]
                lpath_features = [
                    tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[lpath_])) for lpath_ in lpath_list
                ]
                cp_features = [
                    tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[cp_])) for cp_ in cp_list
                ]

                feature_list = {
                    'token_id': tf.train.FeatureList(feature=token_id),
                    'span': tf.train.FeatureList(feature=span),
                    'masks': tf.train.FeatureList(feature=mask),
                    'cue': tf.train.FeatureList(feature=cue),
                    'pos': tf.train.FeatureList(feature=pos_features),
                    'dep': tf.train.FeatureList(feature=dep_features),
                    'path': tf.train.FeatureList(feature=path_features),
                    'lpath': tf.train.FeatureList(feature=lpath_features),
                    'cp': tf.train.FeatureList(feature=cp_features),
                }

                context = tf.train.Features(
                    feature={
                        "length":
                        tf.train.Feature(int64_list=tf.train.Int64List(
                            value=[len(subword_id_list)])),
                    })

                feature_lists = tf.train.FeatureLists(
                    feature_list=feature_list)
                ex = tf.train.SequenceExample(feature_lists=feature_lists,
                                              context=context)
                tf_writer.write(ex.SerializeToString())

    tf_writer.close()
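
A minimal usage sketch for build_dataset (the file paths and the label-to-id dictionaries below are hypothetical, and VOCAB_FILE, DO_LOWER_CASE and mapping() must already be defined at module level as in the original project):

pos2id = {'NOUN': 1, 'VERB': 2}   # hypothetical mappings, for illustration only
dep2id = {'nsubj': 1, 'dobj': 2}
path2id = {'up': 1, 'down': 2}
build_dataset('train.conll', 'train.tfrecord', pos2id, dep2id, path2id, truncate=True)
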
Example 6
from bert.tokenization import FullTokenizer
from sacremoses import MosesTokenizer
from sequence_transfer.sequence import TokenSequence
from sequence_transfer.magic_transfer import MagicTransfer

# 01 - Create tokenizer
moses_tokenizer = MosesTokenizer('en')
bert_tokenizer = FullTokenizer('../vocab.txt')

# 02 - Create tokens
text = "She lives in Lindström, Minnesota"
moses_tokens = moses_tokenizer.tokenize(text)
bert_tokens = bert_tokenizer.tokenize(text)

# 03 - Create sequences
s1 = TokenSequence.new(moses_tokens)
s2 = TokenSequence.new(bert_tokens)

# 04 - Create transfer
transfer = MagicTransfer(s1, s2)

# 05 - We will use the transfer object to find the bert tokens that correspond to the moses token at index 3 (`Lindström`)
sub1 = s1[3]  # `Lindström`
sub2 = transfer.apply(sub1)
for token in sub2:
    print(token.text)

# 06 - We can debug the transfer
transfer.debug()