def _ner_bert_tokenize(
        tokens: List[str],
        mask: List[int],
        tags: List[str],
        tokenizer: FullTokenizer,
        max_subword_len: int = None,
        mode: str = None,
        token_masking_prob: float = 0.0
    ) -> Tuple[List[str], List[int], List[str]]:
        tokens_subword = ['[CLS]']
        mask_subword = [0]
        tags_subword = ['X']
        for token, flag, tag in zip(tokens, mask, tags):
            subwords = tokenizer.tokenize(token)
            if not subwords or \
                    ((max_subword_len is not None) and (len(subwords) > max_subword_len)):
                tokens_subword.append('[UNK]')
                mask_subword.append(flag)
                tags_subword.append(tag)
            else:
                if mode == 'train' and token_masking_prob > 0.0 \
                        and np.random.rand() < token_masking_prob:
                    tokens_subword.extend(['[MASK]'] * len(subwords))
                else:
                    tokens_subword.extend(subwords)
                mask_subword.extend([flag] + [0] * (len(subwords) - 1))
                tags_subword.extend([tag] + ['X'] * (len(subwords) - 1))

        tokens_subword.append('[SEP]')
        mask_subword.append(0)
        tags_subword.append('X')

        return tokens_subword, mask_subword, tags_subword
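A minimal usage sketch for _ner_bert_tokenize; the vocab path, sentence, mask and tags below are illustrative assumptions, not taken from the original code:

# Hedged example; 'vocab.txt' is a placeholder WordPiece vocabulary path.
tokenizer = FullTokenizer(vocab_file='vocab.txt', do_lower_case=False)
tokens = ['John', 'lives', 'in', 'Heidelberg']
mask = [1, 1, 1, 1]                      # 1 = predict a tag for this token
tags = ['B-PER', 'O', 'O', 'B-LOC']
subtokens, submask, subtags = _ner_bert_tokenize(tokens, mask, tags, tokenizer)
# subtokens is wrapped in [CLS] ... [SEP]; continuation subwords get mask 0 and tag 'X'.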
Example #2
def parse_text(text):
    sentences = text.split('\n\n')

    all_pos = Counter()
    all_dep = Counter()
    all_path = Counter()
    all_vocab = Counter()

    tokenizer = FullTokenizer(vocab_file=VOCAB_FILE,
                              do_lower_case=DO_LOWER_CASE)

    for sentence in sentences:
        token_sequence = []

        for token in sentence.split('\n'):
            if len(token) >= 8:
                token = token.split('\t')
                token_sequence.append(token)

        subwords = sum(
            [tokenizer.tokenize(item[0]) for item in token_sequence], [])
        all_vocab.update(subwords)
        all_pos.update([item[2] for item in token_sequence])
        all_dep.update([item[3] for item in token_sequence])
        all_path.update([item[4] for item in token_sequence])

    return all_pos, all_dep, all_path, all_vocab
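A hedged sketch of the input this function expects, inferred from the indices used above (0 = word, 2 = POS, 3 = dependency label, 4 = path): one token per line with tab-separated fields, sentences separated by blank lines. VOCAB_FILE and DO_LOWER_CASE are module-level constants assumed to be defined elsewhere.

# Illustrative input only; the exact field layout is an assumption.
sample = ('The\t_\tDET\tdet\tROOT.nsubj.det\n'
          'cat\t_\tNOUN\tnsubj\tROOT.nsubj\n'
          'sleeps\t_\tVERB\tROOT\tROOT\n'
          '\n'
          'It\t_\tPRON\tnsubj\tROOT.nsubj\n'
          'purrs\t_\tVERB\tROOT\tROOT')
all_pos, all_dep, all_path, all_vocab = parse_text(sample)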
Example #3
def bert_tokenize(vocab_fname, corpus_fname, output_fname):
    tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:
        for line in f1:
            sentence = line.replace('\n', '').strip()
            tokens = tokenizer.tokenize(convert_to_unicode(sentence))
            tokenized_sent = ' '.join(tokens)
            f2.writelines(tokenized_sent + '\n')
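An illustrative call with placeholder file paths (convert_to_unicode is assumed to come from bert.tokenization, as in the original module):

# Hedged usage; the paths below are placeholders, not from the original code.
bert_tokenize(vocab_fname='vocab.txt',
              corpus_fname='corpus.txt',
              output_fname='corpus.tokenized.txt')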
class BertTokenizer:
    def __init__(self, bert_path, tokenizer_cls=FullTokenizer, maxlen=512):
        self.maxlen = maxlen
        # with tf.compat.v1.Session() as sess:
        #     bert = hub.Module(bert_path)
        #     tk_info = bert(signature='tokenization_info', as_dict=True)
        #     tk_info = [tk_info['vocab_file'], tk_info['do_lower_case']]
        #     vocab_file, do_lower_case = sess.run(tk_info)
        #     self.tokenizer = tokenizer_cls(vocab_file, do_lower_case)
        bert_layer = hub.KerasLayer(bert_path, trainable=True)
        vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
        do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
        self.tokenizer = FullTokenizer(vocab_file, do_lower_case)

    def convert_sentences_to_ids(self, sentences):
        ids = list(map(self.convert_single_sentence_to_ids, sentences))
        return np.array(ids)

    def convert_single_sentence_to_ids(self, sentence):
        tokens = self.tokenize(sentence)
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        tokens += (self.maxlen - len(tokens)) * ['[PAD]']
        return self.tokenizer.convert_tokens_to_ids(tokens)

    def convert_two_sentence_to_ids(self,
                                    sent1,
                                    sent2,
                                    maxlen=None,
                                    return_tokens=False):
        if not maxlen:
            maxlen = self.maxlen
        tokens1 = self.tokenize(sent1)
        tokens2 = self.tokenize(sent2)
        if len(tokens1) + len(tokens2) > maxlen - 3:
            tokens2 = tokens2[:maxlen - 3 - len(tokens1)]
        tokens = ['[CLS]'] + tokens1 + ['[SEP]'] + tokens2 + ['[SEP]']
        tokens += (maxlen - len(tokens)) * ['[PAD]']
        ids = self.tokenizer.convert_tokens_to_ids(tokens)
        if return_tokens:
            return tokens1, tokens2, ids
        return ids

    def convert_sentence_to_features(self, sent1, sent2, maxlen=None):
        if not maxlen:
            maxlen = self.maxlen

        tokens1, tokens2, token_ids = self.convert_two_sentence_to_ids(
            sent1, sent2, maxlen, return_tokens=True)
        segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
        input_mask = [1] * len(segment_ids)
        segment_ids += (maxlen - len(segment_ids)) * [0]
        input_mask += (maxlen - len(input_mask)) * [0]

        return token_ids, input_mask, segment_ids

    def tokenize(self, sent):
        return self.tokenizer.tokenize(sent)
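A hedged usage sketch for BertTokenizer; the TF Hub handle is an assumption, and any BERT SavedModel that exposes a vocab_file asset and a do_lower_case attribute should work (hub refers to tensorflow_hub as imported in the original module):

# Illustrative only; requires tensorflow_hub and network access to the handle.
bert_path = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1'
bt = BertTokenizer(bert_path, maxlen=128)
ids = bt.convert_sentences_to_ids(['hello, bert!'])              # shape (1, 128)
token_ids, input_mask, segment_ids = bt.convert_sentence_to_features(
    'how are you?', 'i am fine.', maxlen=64)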
class Inferer:
  def __init__(self, checkpoint, attr_values_file, vocab_file):
    self.checkpoint = checkpoint
    self.attr_values_file = attr_values_file
    self.vocab_file = vocab_file
    if not os.path.exists(self.checkpoint):
      raise Exception("local checkpoint %s does not exist" % self.checkpoint)
    if not os.path.exists(self.attr_values_file):
      raise Exception("local attr_values_file %s does not exist" % self.attr_values_file)
    if not os.path.exists(self.vocab_file):
      raise Exception("local vocab_file %s does not exist" % self.vocab_file)
    self.config = InferConfig()
    self.tokenizer = FullTokenizer(self.vocab_file)
    with open(self.attr_values_file, 'rb') as fr:
      attr_values, attr_values_r = pickle.load(fr)
    self.attr_values_r = attr_values_r
    self.config.output_dim = len(attr_values_r)

    self.graph = tf.Graph()
    with self.graph.as_default():
      self.input_ids_p = tf.placeholder(tf.int32, [None, self.config.max_seq_length])
      self.token_type_ids_p = tf.placeholder(tf.int32, [None, self.config.max_seq_length])
      self.input_mask_p = tf.placeholder(tf.int32, [None, self.config.max_seq_length])
      model = Model(self.config)
      self.inference = model.infer(self.input_ids_p, self.token_type_ids_p, self.input_mask_p)
      ckpt_state = tf.train.get_checkpoint_state(self.checkpoint)
      if not (ckpt_state and ckpt_state.model_checkpoint_path):
        raise Exception('No model to eval yet at: ' + self.checkpoint)
      self.sess = tf.Session(config = tf.ConfigProto(allow_soft_placement = True))
      saver = tf.train.Saver()
      saver.restore(self.sess, ckpt_state.model_checkpoint_path)

  def infer(self, sequences):
    transforms = [self._transform(s) for s in sequences if s != '']
    input_ids, token_type_ids, input_mask = list(map(lambda x: list(x), zip(*transforms)))
    with self.graph.as_default():
      result = self.sess.run(self.inference, feed_dict = {
                                                           self.input_ids_p: input_ids,
                                                           self.token_type_ids_p: token_type_ids,
                                                           self.input_mask_p: input_mask
                                                         })
    return [self.attr_values_r[e] for e in result]

  def _transform(self, sequence):
    tokens = self.tokenizer.tokenize(sequence)
    if len(tokens) > self.config.max_seq_length - 2:
      tokens = tokens[0:self.config.max_seq_length - 2]
    tokens = ['[CLS]'] + tokens + ['[SEP]']
    token_ids = self.tokenizer.convert_tokens_to_ids(tokens)

    input_ids_1 = token_ids[0:self.config.max_seq_length] + [0] * (self.config.max_seq_length - len(token_ids))
    token_type_ids_1 = [0] * self.config.max_seq_length
    input_mask_1 = [1] * len(token_ids) + [0] * (self.config.max_seq_length - len(token_ids))
    return input_ids_1, token_type_ids_1, input_mask_1
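A hedged usage sketch for Inferer; the paths are placeholders, and Model and InferConfig are assumed to come from the surrounding project:

# Illustrative only; the checkpoint directory, pickle file and vocab are placeholders.
inferer = Inferer(checkpoint='./checkpoint',
                  attr_values_file='./attr_values.pkl',
                  vocab_file='./vocab.txt')
print(inferer.infer(['first input text', 'second input text']))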
    def test_compare(self):

        model_dir = tempfile.TemporaryDirectory().name
        os.makedirs(model_dir)
        save_path = MiniBertFactory.create_mini_bert_weights(model_dir)
        tokenizer = FullTokenizer(vocab_file=os.path.join(
            model_dir, "vocab.txt"),
                                  do_lower_case=True)

        # prepare input
        max_seq_len = 16
        input_str = "hello, bert!"
        input_tokens = tokenizer.tokenize(input_str)
        input_tokens = ["[CLS]"] + input_tokens + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        input_ids = input_ids + [0] * (max_seq_len - len(input_tokens))
        input_mask = [1] * len(input_tokens) + [0] * (max_seq_len -
                                                      len(input_tokens))
        token_type_ids = [0] * len(input_tokens) + [0] * (max_seq_len -
                                                          len(input_tokens))

        input_ids = np.array([input_ids], dtype=np.int32)
        input_mask = np.array([input_mask], dtype=np.int32)
        token_type_ids = np.array([token_type_ids], dtype=np.int32)

        print("   tokens:", input_tokens)
        print(
            "input_ids:{}/{}:{}".format(len(input_tokens), max_seq_len,
                                        input_ids), input_ids.shape,
            token_type_ids)

        bert_1_seq_out = CompareBertActivationsTest.predict_on_stock_model(
            model_dir, input_ids, input_mask, token_type_ids)
        bert_2_seq_out = CompareBertActivationsTest.predict_on_keras_model(
            model_dir, input_ids, input_mask, token_type_ids)

        np.set_printoptions(precision=9,
                            threshold=20,
                            linewidth=200,
                            sign="+",
                            floatmode="fixed")

        print("stock bert res", bert_1_seq_out.shape)
        print("keras bert res", bert_2_seq_out.shape)

        print("stock bert res:\n {}".format(bert_1_seq_out[0, :2, :10]),
              bert_1_seq_out.dtype)
        print("keras bert_res:\n {}".format(bert_2_seq_out[0, :2, :10]),
              bert_2_seq_out.dtype)

        abs_diff = np.abs(bert_1_seq_out - bert_2_seq_out).flatten()
        print("abs diff:", np.max(abs_diff), np.argmax(abs_diff))
        self.assertTrue(np.allclose(bert_1_seq_out, bert_2_seq_out, atol=1e-6))
    def test_finetune(self):

        model_dir = tempfile.TemporaryDirectory().name
        os.makedirs(model_dir)
        save_path = MiniBertFactory.create_mini_bert_weights(model_dir)
        tokenizer = FullTokenizer(vocab_file=os.path.join(
            model_dir, "vocab.txt"),
                                  do_lower_case=True)

        # prepare input
        max_seq_len = 24
        input_str_batch = ["hello, bert!", "how are you doing!"]

        input_ids_batch = []
        token_type_ids_batch = []
        for input_str in input_str_batch:
            input_tokens = tokenizer.tokenize(input_str)
            input_tokens = ["[CLS]"] + input_tokens + ["[SEP]"]

            print("input_tokens len:", len(input_tokens))

            input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
            input_ids = input_ids + [0] * (max_seq_len - len(input_tokens))
            token_type_ids = [0] * len(input_tokens) + [0] * (
                max_seq_len - len(input_tokens))

            input_ids_batch.append(input_ids)
            token_type_ids_batch.append(token_type_ids)

        input_ids = np.array(input_ids_batch, dtype=np.int32)
        token_type_ids = np.array(token_type_ids_batch, dtype=np.int32)

        print("   tokens:", input_tokens)
        print(
            "input_ids:{}/{}:{}".format(len(input_tokens), max_seq_len,
                                        input_ids), input_ids.shape,
            token_type_ids)

        model = CompareBertActivationsTest.load_keras_model(
            model_dir, max_seq_len)
        model.compile(optimizer=keras.optimizers.Adam(),
                      loss=keras.losses.mean_squared_error)

        pres = model.predict([input_ids, token_type_ids
                              ])  # just for fetching the shape of the output
        print("pres:", pres.shape)

        model.fit(x=(input_ids, token_type_ids),
                  y=np.zeros_like(pres),
                  batch_size=2,
                  epochs=2)
def tokenize_document(doc_info: dict, tokenizer: FullTokenizer) -> dict:
    """
    tokenize into sub tokens
    :param doc_info:
    :param tokenizer:
    :return:
    """
    sub_tokens: List[str] = []  # all sub tokens of a document
    sentence_map: List[int] = []  # collected tokenized tokens -> sentence id
    subtoken_map: List[int] = []  # collected tokenized tokens -> original token id
    word_idx = -1

    for sentence_id, sentence in enumerate(doc_info['sentences']):
        for token in sentence:
            word_idx += 1
            word_tokens = tokenizer.tokenize(token)
            sub_tokens.extend(word_tokens)
            sentence_map.extend([sentence_id] * len(word_tokens))
            subtoken_map.extend([word_idx] * len(word_tokens))

    speakers = {
        subtoken_map.index(word_index): tokenizer.tokenize(speaker)
        for word_index, speaker in doc_info['speakers']
    }
    clusters = [[(subtoken_map.index(start),
                  len(subtoken_map) - 1 - subtoken_map[::-1].index(end))
                 for start, end in cluster]
                for cluster in doc_info['clusters']]
    tokenized_document = {
        'sub_tokens': sub_tokens,
        'sentence_map': sentence_map,
        'subtoken_map': subtoken_map,
        'speakers': speakers,
        'clusters': clusters,
        'doc_key': doc_info['doc_key']
    }
    return tokenized_document
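A hedged sketch of the doc_info layout this helper expects, inferred from the fields it reads; the values and the vocab path are illustrative:

# Illustrative only; word indices in 'speakers' and 'clusters' refer to the
# flattened word positions across all sentences.
doc_info = {
    'sentences': [['John', 'loves', 'Mary'], ['She', 'loves', 'him', 'too']],
    'speakers': [(0, 'speaker1'), (3, 'speaker2')],    # (word index, speaker string)
    'clusters': [[(0, 0), (5, 5)], [(2, 2), (3, 3)]],  # word-level (start, end) spans
    'doc_key': 'example_doc',
}
tokenizer = FullTokenizer(vocab_file='vocab.txt')      # placeholder path
tokenized = tokenize_document(doc_info, tokenizer)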
Example #9
class BERTTokenizer(BaseTokenizer):
    def __init__(self, vocab_file=None, **kwargs):

        if vocab_file is None:
            raise ValueError(
                'Vocabulary file is required to initialize BERT tokenizer'
            )

        from bert.tokenization import FullTokenizer

        self.tokenizer = FullTokenizer(vocab_file)

    def __call__(self, text):
        return self.tokenizer.tokenize(text)
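A hedged usage sketch for this BERTTokenizer wrapper; vocab.txt is a placeholder path:

# Illustrative only.
bert_tok = BERTTokenizer(vocab_file='vocab.txt')
print(bert_tok('BERT splits rare words into WordPiece sub-tokens.'))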
    def test_direct_keras_to_stock_compare(self):
        from tests.ext.modeling import BertModel, BertConfig, get_assignment_map_from_checkpoint

        bert_config = BertConfig.from_json_file(self.bert_config_file)
        tokenizer = FullTokenizer(
            vocab_file=os.path.join(self.bert_ckpt_dir, "vocab.txt"))

        # prepare input
        max_seq_len = 6
        input_str = "Hello, Bert!"
        input_tokens = tokenizer.tokenize(input_str)
        input_tokens = ["[CLS]"] + input_tokens + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        input_ids = input_ids + [0] * (max_seq_len - len(input_tokens))
        input_mask = [1] * len(input_tokens) + [0] * (max_seq_len -
                                                      len(input_tokens))
        token_type_ids = [0] * len(input_tokens) + [0] * (max_seq_len -
                                                          len(input_tokens))

        input_ids = np.array([input_ids], dtype=np.int32)
        input_mask = np.array([input_mask], dtype=np.int32)
        token_type_ids = np.array([token_type_ids], dtype=np.int32)

        print("   tokens:", input_tokens)
        print(
            "input_ids:{}/{}:{}".format(len(input_tokens), max_seq_len,
                                        input_ids), input_ids.shape,
            token_type_ids)

        s_res = self.predict_on_stock_model(input_ids, input_mask,
                                            token_type_ids)
        k_res = self.predict_on_keras_model(input_ids, input_mask,
                                            token_type_ids)

        np.set_printoptions(precision=9,
                            threshold=20,
                            linewidth=200,
                            sign="+",
                            floatmode="fixed")
        print("s_res", s_res.shape)
        print("k_res", k_res.shape)

        print("s_res:\n {}".format(s_res[0, :2, :10]), s_res.dtype)
        print("k_res:\n {}".format(k_res[0, :2, :10]), k_res.dtype)

        adiff = np.abs(s_res - k_res).flatten()
        print("diff:", np.max(adiff), np.argmax(adiff))
        self.assertTrue(np.allclose(s_res, k_res, atol=1e-6))
Example #11
class BERTTokenizer(BaseTokenizer):
    def __init__(self, vocab_file=None, **kwargs):
        super().__init__()
        if vocab_file is None:
            raise ValueError(
                'Vocabulary file is required to initialize BERT tokenizer')

        try:
            from bert.tokenization import FullTokenizer
        except ImportError:
            raise ValueError(
                "Please install bert-tensorflow: pip install bert-tensorflow")

        self.tokenizer = FullTokenizer(vocab_file)

    def __call__(self, text):
        return ['[CLS]'] + self.tokenizer.tokenize(text) + ['[SEP]']
Example #12
def tokenize_single_input(text, tokenizer: btk.FullTokenizer,
                          max_input_length):
    tokens = ['[CLS]']
    tokens += tokenizer.tokenize(text)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    token_masks = [1] * len(token_ids)
    segment_ids = [0] * max_input_length

    if len(token_ids) > max_input_length:
        raise ValueError(
            'The input has %i tokens but the maximum allowed is only %i.' %
            (len(token_ids), max_input_length))

    while len(token_ids) != max_input_length:
        token_ids.append(0)
        token_masks.append(0)

    return token_ids, token_masks, segment_ids
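A hedged call sketch; btk refers to bert.tokenization as in the signature above, and the vocab path is a placeholder:

# Illustrative only.
tokenizer = btk.FullTokenizer(vocab_file='vocab.txt', do_lower_case=True)
token_ids, token_masks, segment_ids = tokenize_single_input(
    'hello, bert!', tokenizer, max_input_length=16)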
Example #13
def tokenize_data(input_str_batch, max_seq_len, model_dir):
    tokenizer = FullTokenizer(vocab_file=os.path.join(model_dir, "vocab.txt"),
                              do_lower_case=True)
    input_ids_batch = []
    token_type_ids_batch = []
    for input_str in input_str_batch:
        input_tokens = tokenizer.tokenize(input_str)
        input_tokens = ["[CLS]"] + input_tokens + ["[SEP]"]

        print("input_tokens len:", len(input_tokens))

        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        if len(input_tokens) > max_seq_len:
            input_ids = input_ids[:max_seq_len]
        else:
            input_ids = input_ids + [0] * (max_seq_len - len(input_tokens))
        # token_type_ids = [0] * len(input_tokens) + [0] * (max_seq_len - len(input_tokens))
        token_type_ids = [0] * max_seq_len
        input_ids_batch.append(input_ids)
        token_type_ids_batch.append(token_type_ids)
    return input_ids_batch, token_type_ids_batch
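An illustrative call; model_dir is assumed to contain a BERT vocab.txt:

# Placeholder paths and inputs.
input_ids_batch, token_type_ids_batch = tokenize_data(
    ['hello, bert!', 'how are you doing!'], max_seq_len=24, model_dir='./mini_bert')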
Example #14
class DisasterDetector:
    def __init__(self, bert_layer, max_sql, lr, batch_size, epochs):

        self.bert_layer = bert_layer
        self.max_sql = max_sql
        vocab = self.bert_layer.resolved_object.vocab_file.asset_path.numpy()
        lowercase = self.bert_layer.resolved_object.do_lower_case.numpy()
        self.token = FullTokenizer(vocab, lowercase)
        self.lr = lr
        self.batch_size = batch_size
        self.epochs = epochs
        self.models = []
        self.scores = {}

    def encode(self, texts):

        all_tokens = []
        all_masks = []
        all_segments = []
        for text in texts:
            text = self.token.tokenize(text)
            text = text[:self.max_sql - 2]
            input_seq = ['[CLS]'] + text + ['[SEP]']
            pad_len = self.max_sql - len(input_seq)
            tokens = self.token.convert_tokens_to_ids(input_seq)
            tokens += [0] * pad_len
            pad_masks = [1] * len(input_seq) + [0] * pad_len
            segment_ids = [0] * self.max_sql
            all_tokens.append(tokens)
            all_masks.append(pad_masks)
            all_segments.append(segment_ids)
        return np.array(all_tokens), np.array(all_masks), np.array(
            all_segments)

    def build_model(self):

        input_words = Input(shape=(self.max_sql, ),
                            dtype=tf.int32,
                            name='input_words')
        input_mask = Input(shape=(self.max_sql, ),
                           dtype=tf.int32,
                           name='input_mask')
        segmentids = Input(shape=(self.max_sql, ),
                           dtype=tf.int32,
                           name='segment_ids')
        _, sequence_output = self.bert_layer(
            [input_words, input_mask, segmentids])  # discard the pooled output, keep the sequence output
        clf_output = sequence_output[:, 0, :]
        out = Dense(1, activation='sigmoid')(clf_output)

        model = Model(inputs=[input_words, input_mask, segmentids],
                      outputs=out)
        optimizer = Adam(learning_rate=self.lr)
        model.compile(loss='binary_crossentropy',
                      optimizer=optimizer,
                      metrics=['accuracy'])
        return model

    def fit(self, x):
        xtrain, xval, ytrain, yval = train_test_split(x,
                                                      x.target_relabeled,
                                                      test_size=0.2,
                                                      random_state=878)
        ytrain = xtrain.target_relabeled
        xtrain = self.encode(xtrain.cleaned.str.lower())
        yval = xval.target_relabeled
        xval = self.encode(xval.cleaned.str.lower())
        metrics = ClassificationReport(train=(xtrain, ytrain),
                                       val=(xval, yval))
        checkpoint = ModelCheckpoint('model_BERT.h5',
                                     monitor='val_loss',
                                     save_best_only=True)
        model = self.build_model()
        model.fit(xtrain,
                  ytrain,
                  validation_data=(xval, yval),
                  callbacks=[metrics, checkpoint],
                  epochs=self.epochs,
                  batch_size=self.batch_size)

    def predict(self, x):
        model = self.build_model()
        model.load_weights('model_BERT.h5')
        xtest = self.encode(x.cleaned.str.lower())
        ypred = model.predict(xtest)
        return ypred
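A hedged usage sketch; bert_layer is assumed to be a hub.KerasLayer BERT model, and df / test_df are assumed pandas DataFrames with 'cleaned' text and 'target_relabeled' label columns (names taken from the methods above):

# Illustrative only; the TF Hub handle and the DataFrames are assumptions.
bert_layer = hub.KerasLayer(
    'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1', trainable=True)
detector = DisasterDetector(bert_layer, max_sql=160, lr=1e-5, batch_size=16, epochs=3)
detector.fit(df)                   # df: training DataFrame
preds = detector.predict(test_df)  # test_df: held-out DataFrame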
Example #15
class BertPreprocessor(Preprocessor):
    """Preprocessor for BERT embedding.

    This class can be used to do all the work to create the inputs (and outputs) of a Neural Network using BERT
    as embedding. Currently only single sequence classification is supported.

    Source: https://github.com/google-research/bert_keras
    """
    def __init__(self, pretrained_model_path: str, **kwargs):

        super().__init__(**kwargs)

        info = hub.Module(spec=pretrained_model_path)(
            signature="tokenization_info", as_dict=True)

        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run(
                [info["vocab_file"], info["do_lower_case"]])

        # Create the tokenizer with the vocabulary of the pretrained model
        self._tokenizer = FullTokenizer(vocab_file=vocab_file,
                                        do_lower_case=do_lower_case)

        basic_tokens = self._tokenizer.convert_tokens_to_ids(
            ["[CLS]", "[SEP]"])
        self._CLS_token = basic_tokens[0]
        self._SEP_token = basic_tokens[1]

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length."""

        # This is a simple heuristic which will always truncate the longer sequence
        # one token at a time. This makes more sense than truncating an equal percent
        # of tokens from each, since if one sequence is very short then each token
        # that's truncated likely contains more information than a longer sequence.
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def _padding_sentence(self):
        """Return a zero length sentence to pad last batch.

        :return: Three sequences of zeros (tokens, masks, segment ids).
        """

        return ([0] * self._max_seq_len, [0] * self._max_seq_len,
                [0] * self._max_seq_len)

    def tokenize(self, text_a: str, text_b: str = None):
        """Convert sequence(s) of words into sequence(s) of tokens and also compute the masking- and segment ids.

        For further details please read BERT paper.

        :param text_a: First sequence
        :param text_b: Second sequence
        :return: The sequence of tokens, masks and segment ids.
        """

        input_ids = [0] * self._max_seq_len
        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        input_mask = [0] * self._max_seq_len
        # The segment ids are 0 for text_a and 1 for text_b
        input_segment_ids = [0] * self._max_seq_len

        tokens_a = self._tokenizer.tokenize(text_a)
        tokens_b = None
        if text_b:
            tokens_b = self._tokenizer.tokenize(text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            self._truncate_seq_pair(tokens_a, tokens_b, self._max_seq_len - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > self._max_seq_len - 2:
                tokens_a = tokens_a[0:(self._max_seq_len - 2)]

        idx = 0
        input_ids[idx] = self._CLS_token
        idx += 1

        for element in self._tokenizer.convert_tokens_to_ids(tokens_a):
            input_ids[idx] = element
            input_mask[idx] = 1
            idx += 1

        if tokens_b:
            for element in self._tokenizer.convert_tokens_to_ids(tokens_b):
                input_ids[idx] = element
                input_mask[idx] = 1
                input_segment_ids[idx] = 1
                idx += 1

        input_ids[idx] = self._SEP_token

        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        for i in range(idx + 1):
            input_mask[i] = 1

        # safety check
        assert len(input_ids) == self._max_seq_len
        assert len(input_mask) == self._max_seq_len
        assert len(input_segment_ids) == self._max_seq_len

        return input_ids, input_mask, input_segment_ids

    def fit(self, texts: List[str]) -> 'BertPreprocessor':
        """This function does nothing in case of BERT but must be implemented.

        :param texts: -
        :return: self
        """

        return self

    def transform(self, examples: List[InputExample]) -> list:
        """Transform sequences of words into sequences of tokens, masks and segment ids.

        Masks are used to separate valid and padding tokens. Segment ids are 0 for tokens of
        text_a and 1 for tokens of text_b, matching the behaviour of tokenize().

        For further details please read the BERT paper.

        :param examples: The list of InputExample objects to transform.
        :return: The sequences of tokens, masks and segment ids.
        """

        input_ids, input_masks, segment_ids = [], [], []

        for i, example in enumerate(examples):
            input_id, input_mask, segment_id = self.tokenize(
                text_a=example.text_a, text_b=example.text_b)
            input_ids.append(input_id)
            input_masks.append(input_mask)
            segment_ids.append(segment_id)

        return [
            np.array(input_ids),
            np.array(input_masks),
            np.array(segment_ids)
        ]

    def inverse_transform(self, sequences: np.ndarray):
        """Transform sequences of tokens back to sequences of words (sentences).

        :param sequences: The sequences of tokens.
        :return: The sequences of words
        """

        return self._tokenizer.convert_ids_to_tokens(sequences)
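A hedged usage sketch; the TF Hub module URL is an assumption, and the max_seq_len keyword is hypothetical (the snippet only shows that self._max_seq_len is used, presumably set by the Preprocessor base class):

# Illustrative only; requires TF1-style tensorflow and tensorflow_hub.
preproc = BertPreprocessor(
    pretrained_model_path='https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1',
    max_seq_len=64)
ids, mask, segments = preproc.tokenize('first sentence', 'second sentence')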
Example #16
class BertNer(object):
    def __init__(self, **kwargs):
        self.tf = import_tf(kwargs['gpu_no'], kwargs['verbose'])
        self.logger = set_logger('BertNer', kwargs['log_dir'],
                                 kwargs['verbose'])
        self.model_dir = kwargs['ner_model']

        from bert.tokenization import FullTokenizer
        self.tokenizer = FullTokenizer(
            os.path.join(self.model_dir, 'vocab.txt'))

        self.ner_sq_len = 128
        self.input_ids = self.tf.placeholder(self.tf.int32,
                                             (None, self.ner_sq_len),
                                             'input_ids')
        self.input_mask = self.tf.placeholder(self.tf.int32,
                                              (None, self.ner_sq_len),
                                              'input_mask')

        # init graph
        self._init_graph()

        # init ner assist data
        self._init_predict_var()

        self.per_proun = [
            '甲', '乙', '丙', '丁', '戊', '己', '庚', '辛', '壬', '癸', '子', '丑', '寅',
            '卯', '辰', '巳', '午', '未', '申', '酉', '戌', '亥'
        ]

    def _init_graph(self):
        """
        init bert ner graph
        :return:
        """
        try:
            with self.tf.gfile.GFile(
                    os.path.join(self.model_dir, 'ner_model.pb'), 'rb') as f:
                graph_def = self.tf.GraphDef()
                graph_def.ParseFromString(f.read())
                input_map = {
                    "input_ids:0": self.input_ids,
                    'input_mask:0': self.input_mask
                }

                self.pred_ids = self.tf.import_graph_def(
                    graph_def,
                    name='',
                    input_map=input_map,
                    return_elements=['pred_ids:0'])[0]
                graph = self.pred_ids.graph

                sess_config = self.tf.ConfigProto(allow_soft_placement=True)
                sess_config.gpu_options.allow_growth = True

                self.sess = self.tf.Session(graph=graph, config=sess_config)
                self.sess.run(self.tf.global_variables_initializer())
                self.tf.reset_default_graph()

        except Exception as e:
            self.logger.error(e)

    def _init_predict_var(self):
        """
        initialize assist of bert ner
        :return: labels num of ner, label to id dict, id to label dict
        """
        with open(os.path.join(self.model_dir, 'label2id.pkl'), 'rb') as rf:
            self.id2label = {
                value: key
                for key, value in pickle.load(rf).items()
            }

    def _convert_lst_to_features(self,
                                 lst_str,
                                 is_tokenized=True,
                                 mask_cls_sep=False):
        """
        Loads a data file into a list of `InputBatch`s.
        :param lst_str: list str
        :param is_tokenized: whether token unknown word
        :param mask_cls_sep: masking the embedding on [CLS] and [SEP] with zero.
        :return: input feature instance
        """
        from bert.extract_features import read_tokenized_examples, read_examples, InputFeatures

        examples = read_tokenized_examples(
            lst_str) if is_tokenized else read_examples(lst_str)

        _tokenize = lambda x: (self.tokenizer.mark_unk_tokens(x)
                               if is_tokenized else self.tokenizer.tokenize(x))

        for (ex_index, example) in enumerate(examples):
            tokens_a = _tokenize(example.text_a)

            tokens_b = None
            if example.text_b:
                tokens_b = _tokenize(example.text_b)

            if tokens_b:
                # Modifies `tokens_a` and `tokens_b` in place so that the total
                # length is less than the specified length.
                # Account for [CLS], [SEP], [SEP] with "- 3"
                self._truncate_seq_pair(tokens_a, tokens_b)
            else:
                # Account for [CLS] and [SEP] with "- 2"
                if len(tokens_a) > self.ner_sq_len - 2:
                    tokens_a = tokens_a[0:(self.ner_sq_len - 2)]

            # The convention in BERT is:
            # (a) For sequence pairs:
            #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
            #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
            # (b) For single sequences:
            #  tokens:   [CLS] the dog is hairy . [SEP]
            #  type_ids: 0     0   0   0  0     0 0
            #
            # Where "type_ids" are used to indicate whether this is the first
            # sequence or the second sequence. The embedding vectors for `type=0` and
            # `type=1` were learned during pre-training and are added to the wordpiece
            # embedding vector (and position vector). This is not *strictly* necessary
            # since the [SEP] token unambiguously separates the sequences, but it makes
            # it easier for the model to learn the concept of sequences.
            #
            # For classification tasks, the first vector (corresponding to [CLS]) is
            # used as as the "sentence vector". Note that this only makes sense because
            # the entire model is fine-tuned.
            tokens = ['[CLS]'] + tokens_a + ['[SEP]']
            input_type_ids = [0] * len(tokens)
            input_mask = ([int(not mask_cls_sep)] + [1] * len(tokens_a)
                          + [int(not mask_cls_sep)])

            if tokens_b:
                tokens += tokens_b + ['[SEP]']
                input_type_ids += [1] * (len(tokens_b) + 1)
                input_mask += [1] * len(tokens_b) + [int(not mask_cls_sep)]

            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

            # Zero-pad up to the sequence length. more pythonic
            pad_len = self.ner_sq_len - len(input_ids)
            input_ids += [0] * pad_len
            input_mask += [0] * pad_len
            input_type_ids += [0] * pad_len

            assert len(input_ids) == self.ner_sq_len
            assert len(input_mask) == self.ner_sq_len
            assert len(input_type_ids) == self.ner_sq_len

            yield InputFeatures(input_ids=input_ids,
                                input_mask=input_mask,
                                input_type_ids=input_type_ids)

    def _truncate_seq_pair(self, tokens_a, tokens_b):
        """
        Truncates a sequence pair in place to the maximum length.
        :param tokens_a: text a
        :param tokens_b: text b
        """
        try:
            while True:
                total_length = len(tokens_a) + len(tokens_b)

                if total_length <= self.ner_sq_len - 3:
                    break
                if len(tokens_a) > len(tokens_b):
                    tokens_a.pop()
                else:
                    tokens_b.pop()
        except Exception:
            self.logger.error('failed to truncate sequence pair', exc_info=True)

    def _convert_id_to_label(self, pred_ids_result, batch_size):
        """
        turn id to label
        :param pred_ids_result: predict result
        :param batch_size: batch size of predict ids result
        :return: label list
        """
        result = []
        index_result = []
        for row in range(batch_size):
            curr_seq = []
            curr_idx = []
            ids = pred_ids_result[row]
            for idx, id in enumerate(ids):
                if id == 0:
                    break
                curr_label = self.id2label[id]
                if curr_label in ['[CLS]', '[SEP]']:
                    if id == 102 and idx < len(ids) - 1 and ids[idx + 1] == 0:
                        break
                    continue
                # elif curr_label == '[SEP]':
                #     break
                curr_seq.append(curr_label)
                curr_idx.append(id)
            result.append(curr_seq)
            index_result.append(curr_idx)
        return result, index_result

    def predict(self, contents):
        """
        bert ner predict
        :param contents: content list
        :return: predict result
        """
        try:
            splited_contents = []
            all_terms = []
            for content in contents:
                content_len = len(content)
                if content_len % (self.ner_sq_len - 2) == 0:
                    terms = int(content_len / (self.ner_sq_len - 2))
                else:
                    terms = int(content_len / (self.ner_sq_len - 2)) + 1
                all_terms.append(terms)

                for i in range(terms):
                    splited_contents.append(
                        content[i * (self.ner_sq_len - 2):(i + 1) *
                                (self.ner_sq_len - 2)])

            tmp_f = list(self._convert_lst_to_features(splited_contents))
            input_ids = [f.input_ids for f in tmp_f]
            input_masks = [f.input_mask for f in tmp_f]

            pred_result = self.sess.run(self.pred_ids,
                                        feed_dict={
                                            self.input_ids: input_ids,
                                            self.input_mask: input_masks
                                        })

            # restore to original string
            tmp = []
            index = 0
            for terms in all_terms:
                sub_preds = []
                for i in range(terms):
                    sub_preds.extend(pred_result[index + i])
                tmp.append(sub_preds)
                index += terms

            pred_result = tmp

            pred_result = self._convert_id_to_label(pred_result,
                                                    len(pred_result))[0]

            # zip str predict id
            str_pred = []
            for w in zip(contents, pred_result):
                sub_list = []
                for z in zip(list(w[0]), w[1]):
                    sub_list.append([z[0], z[1]])

                str_pred.append(sub_list)

            # get ner
            ner_result = [self._combine_ner(s) for s in str_pred]
            return ner_result

        except Exception as e:
            self.logger.error(e)
            return [[]]

    def _combine_ner(self, pred_result):
        """
        combine ner
        :param pred_result: model predict result and origin content words list
        :return: entity words and index
        """
        words_len = len(pred_result)
        i = 0
        tmp = ''
        _ner_list = []

        while i < words_len:
            word = pred_result[i]
            # add personal pronoun
            if word[0] in self.per_proun and word[1][0] == 'O':
                _ner_list.append([word[0], 'PER'])

            if word[1][0] == 'O' and tmp != '':
                _ner_list.append([tmp, pred_result[i - 1][1][2:]])
                tmp = ''

            elif word[1][0] == 'I':
                tmp = tmp + word[0]
                if i == words_len - 1:
                    _ner_list.append([tmp, word[1][2:]])

            elif word[1][0] == 'B':
                if tmp != '':
                    _ner_list.append([tmp, pred_result[i - 1][1][2:]])

                tmp = word[0]
                if i == words_len - 1:
                    _ner_list.append([tmp, word[1][2:]])

            i += 1

        return _ner_list
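A hedged usage sketch; the keyword arguments mirror the keys read in __init__ above, and the paths are placeholders (the model directory is assumed to contain ner_model.pb, vocab.txt and label2id.pkl):

# Illustrative only.
ner = BertNer(gpu_no=0, verbose=False, log_dir='./logs', ner_model='./ner_model_dir')
print(ner.predict(['甲在北京见了乙。']))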
Example #17
def progSuccess():
    global stProg
    text = request.form['progTextField']
    s = text[0:512]

    final_model = keras.models.load_model(
        '/home/suiSense/my_site/final_regular_model.h5')

    realSuicidal = "According to our algorithm, the text has been classified as suicidal."
    realDepression = "According to our algorithm, the text has been classified as depression, not suicidal."

    max_seq_length = 512  # Your choice here.
    input_word_ids = tf.keras.layers.Input(shape=(max_seq_length, ),
                                           dtype=tf.int32,
                                           name="input_word_ids")
    input_mask = tf.keras.layers.Input(shape=(max_seq_length, ),
                                       dtype=tf.int32,
                                       name="input_mask")
    segment_ids = tf.keras.layers.Input(shape=(max_seq_length, ),
                                        dtype=tf.int32,
                                        name="segment_ids")
    bert_layer = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
        trainable=True)
    pooled_output, sequence_output = bert_layer(
        [input_word_ids, input_mask, segment_ids])

    model = Model(inputs=[input_word_ids, input_mask, segment_ids],
                  outputs=[pooled_output, sequence_output])

    # See BERT paper: https://arxiv.org/pdf/1810.04805.pdf
    # And BERT implementation convert_single_example() at https://github.com/google-research/bert/blob/master/run_classifier.py

    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()

    tokenizer = FullTokenizer(vocab_file, do_lower_case)

    stokens = tokenizer.tokenize(s)

    stokens = ["[CLS]"] + stokens + ["[SEP]"]

    input_ids = get_ids(stokens, tokenizer, max_seq_length)
    input_masks = get_masks(stokens, max_seq_length)
    input_segments = get_segments(stokens, max_seq_length)

    pool_embs, all_embs = model.predict([[input_ids], [input_masks],
                                         [input_segments]])

    predictions = final_model.predict(pool_embs)

    predictionPercentage = predictions[0][0] * 100

    if 0.0 < predictions[0][0] <= 0.500:
        return render_template("progressionSuccess.html",
                               contents="depression",
                               intvar=predictionPercentage)

    elif 0.500 < predictions[0][0] <= 1.000:
        return render_template("progressionSuccess.html",
                               contents="suicidal",
                               intvar=predictionPercentage)

    try:
        reloadWebsite()
    except:
        print('reload failed')
Example #18
from bert.tokenization import FullTokenizer
from sacremoses import MosesTokenizer, MosesDetokenizer
from sequence_transfer.sequence import CharSequence, TokenSequence
from sequence_transfer.plugin.entity_annotation_transfer_plugin import EntityAnnotationTransferPlugin, \
    EntityAnnotationSequence
from sequence_transfer.magic_transfer import MagicTransfer


# We create a char sequence
text = "  J'adore  Zoé!  "
char_sequence = CharSequence.new(text)

# We create the token sequence
tokenizer = FullTokenizer('vocab.txt')
tokens = tokenizer.tokenize(text)
print(tokens)

tokenizer = MosesTokenizer('fr')
tokens = tokenizer.tokenize(text)
print(tokens)

detokenizer = MosesDetokenizer('fr')
y = detokenizer.detokenize(tokens)

print(y)

exit()
token_sequence = TokenSequence.new(tokens)

# We create a magic transfer
transfer = MagicTransfer(char_sequence, token_sequence)
Example #19
class BERTEmbeddingEvaluator(SentenceEmbeddingEvaluator):
    def __init__(
            self,
            model_fname="/notebooks/embedding/data/sentence-embeddings/bert/tune-ckpt",
            bertconfig_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/bert_config.json",
            vocab_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/vocab.txt",
            max_seq_length=32,
            dimension=768,
            num_labels=2,
            use_notebook=False):

        super().__init__("bert", dimension, use_notebook)
        config = BertConfig.from_json_file(bertconfig_fname)
        self.max_seq_length = max_seq_length
        self.tokenizer = FullTokenizer(vocab_file=vocab_fname,
                                       do_lower_case=False)
        self.model, self.input_ids, self.input_mask, self.segment_ids, self.probs = make_bert_graph(
            config, max_seq_length, 1.0, num_labels, tune=False)
        saver = tf.train.Saver(tf.global_variables())
        self.sess = tf.Session()
        checkpoint_path = tf.train.latest_checkpoint(model_fname)
        saver.restore(self.sess, checkpoint_path)

    def predict(self, sentence):
        tokens = self.tokenize(sentence)
        model_input = self.make_input(tokens)
        probs = self.sess.run(self.probs, model_input)
        return probs

    """
    sentence를 입력하면 토크나이즈 결과와 token 벡터 시퀀스를 반환한다
        - shape :[[# of tokens], [batch size, max seq length, dimension]]
    """

    def get_token_vector_sequence(self, sentence):
        tokens = self.tokenize(sentence)
        model_input = self.make_input(tokens)
        return [
            tokens,
            self.sess.run(self.model.get_sequence_output()[0],
                          model_input)[:len(tokens) + 2]
        ]

    """
    sentence를 입력하면 토크나이즈 결과와 [CLS] 벡터를 반환한다
         - shape :[[# of tokens], [batch size, dimension]]
    """

    def get_sentence_vector(self, sentence):
        tokens = self.tokenize(sentence)
        model_input = self.make_input(tokens)
        return [
            tokens,
            self.sess.run(self.model.pooled_output, model_input)[0]
        ]

    """
    sentence를 입력하면 토크나이즈 결과와 self-attention score matrix를 반환한다
        - shape :[[# of tokens], [batch size, # of tokens, # of tokens]]
    """

    def get_self_attention_score(self, sentence):
        tokens = self.tokenize(sentence)
        model_input = self.make_input(tokens)
        # raw_score : shape=[# of layers, batch_size, num_attention_heads, max_seq_length, max_seq_length]
        raw_score = self.sess.run(self.model.attn_probs_for_visualization_list,
                                  model_input)
        # take the last layer, then sum over attention heads (axis=0)
        scores = np.sum(raw_score[-1][0], axis=0)
        # keep only the rows/columns corresponding to the actual tokens
        scores = scores[:len(tokens), :len(tokens)]
        return [tokens, scores]

    def tokenize(self, sentence):
        return self.tokenizer.tokenize(convert_to_unicode(sentence))

    def make_input(self, tokens):
        tokens = tokens[:(self.max_seq_length - 2)]
        token_sequence = ["[CLS]"] + tokens + ["[SEP]"]
        segment = [0] * len(token_sequence)
        sequence = self.tokenizer.convert_tokens_to_ids(token_sequence)
        current_length = len(sequence)
        padding_length = self.max_seq_length - current_length
        input_feed = {
            self.input_ids:
            np.array([sequence + [0] * padding_length]),
            self.segment_ids:
            np.array([segment + [0] * padding_length]),
            self.input_mask:
            np.array([[1] * current_length + [0] * padding_length])
        }
        return input_feed

    def visualize_self_attention_scores(self, sentence):
        tokens, scores = self.get_self_attention_score(sentence)
        visualize_self_attention_scores(tokens,
                                        scores,
                                        use_notebook=self.use_notebook)
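A hedged usage sketch; the default constructor paths point at fine-tuned checkpoints and vocab files that are assumed to exist in the original environment:

# Illustrative only; requires the checkpoint, config and vocab referenced above.
evaluator = BERTEmbeddingEvaluator()
probs = evaluator.predict('이 영화 정말 재미있다')             # class probabilities
tokens, cls_vector = evaluator.get_sentence_vector('이 영화 정말 재미있다')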
Example #20
def advancedProgSuccess():
    global stProg
    #bringing in the text
    baseline_text = request.form['baselineOne'] + ' ' + request.form[
        'baselineTwo'] + ' ' + request.form['baselineThree']
    final_text = request.form['recentOne'] + ' ' + request.form[
        'recentTwo'] + ' ' + request.form['recentThree']

    #text truncation for bert
    baseline_text = baseline_text[0:512]
    final_text = final_text[0:512]

    #initializing models
    final_model = keras.models.load_model(
        '/home/suiSense/my_site/final_regular_model.h5')
    baseline_model = keras.models.load_model(
        '/home/suiSense/my_site/baseline_model.h5')

    #bringing in the bert model to apply for all the text
    max_seq_length = 512
    input_word_ids = tf.keras.layers.Input(shape=(max_seq_length, ),
                                           dtype=tf.int32,
                                           name="input_word_ids")
    input_mask = tf.keras.layers.Input(shape=(max_seq_length, ),
                                       dtype=tf.int32,
                                       name="input_mask")
    segment_ids = tf.keras.layers.Input(shape=(max_seq_length, ),
                                        dtype=tf.int32,
                                        name="segment_ids")
    bert_layer = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
        trainable=True)
    pooled_output, sequence_output = bert_layer(
        [input_word_ids, input_mask, segment_ids])
    model = Model(inputs=[input_word_ids, input_mask, segment_ids],
                  outputs=[pooled_output, sequence_output])
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = FullTokenizer(vocab_file, do_lower_case)

    #baseline model, baseline text
    stokensOne = tokenizer.tokenize(baseline_text)
    stokensOne = ["[CLS]"] + stokensOne + ["[SEP]"]
    input_ids = get_ids(stokensOne, tokenizer, max_seq_length)
    input_masks = get_masks(stokensOne, max_seq_length)
    input_segments = get_segments(stokensOne, max_seq_length)
    pool_embs, all_embs = model.predict([[input_ids], [input_masks],
                                         [input_segments]])
    fxOne = baseline_model.predict(pool_embs)
    fxOne = fxOne[0][0]

    #baseline model, suicidal text
    stokensTwo = tokenizer.tokenize(final_text)
    stokensTwo = ["[CLS]"] + stokensTwo + ["[SEP]"]
    input_ids = get_ids(stokensTwo, tokenizer, max_seq_length)
    input_masks = get_masks(stokensTwo, max_seq_length)
    input_segments = get_segments(stokensTwo, max_seq_length)
    pool_embs, all_embs = model.predict([[input_ids], [input_masks],
                                         [input_segments]])
    fxTwo = baseline_model.predict(pool_embs)
    fxTwo = fxTwo[0][0]

    #suicidal model, baseline text
    stokensThree = tokenizer.tokenize(baseline_text)
    stokensThree = ["[CLS]"] + stokensThree + ["[SEP]"]
    input_ids = get_ids(stokensThree, tokenizer, max_seq_length)
    input_masks = get_masks(stokensThree, max_seq_length)
    input_segments = get_segments(stokensThree, max_seq_length)
    pool_embs, all_embs = model.predict([[input_ids], [input_masks],
                                         [input_segments]])
    gxOne = final_model.predict(pool_embs)
    gxOne = gxOne[0][0]

    #suicidal model, suicidal text
    stokensFour = tokenizer.tokenize(final_text)
    stokensFour = ["[CLS]"] + stokensFour + ["[SEP]"]
    input_ids = get_ids(stokensFour, tokenizer, max_seq_length)
    input_masks = get_masks(stokensFour, max_seq_length)
    input_segments = get_segments(stokensFour, max_seq_length)
    pool_embs, all_embs = model.predict([[input_ids], [input_masks],
                                         [input_segments]])
    gxTwo = final_model.predict(pool_embs)
    gxTwo = gxTwo[0][0]

    if (fxTwo > 0.5):
        if (fxOne > 0.5):
            predictionPercentage = (gxTwo - gxOne) * 100
        else:
            predictionPercentage = ((gxTwo + 1) - fxOne) * 100
    else:
        if (fxOne > 0.5):
            predictionPercentage = (fxTwo - (gxOne + 1)) * 100
        else:
            predictionPercentage = (fxTwo - fxOne) * 100

    significant_digits = 3
    predictionPercentage = round(
        predictionPercentage, significant_digits -
        int(math.floor(math.log10(abs(predictionPercentage)))) - 1)

    absPredictionPercentage = abs(predictionPercentage)

    return render_template("advancedProgressionSuccess.html",
                           intvar=predictionPercentage,
                           absintvar=absPredictionPercentage,
                           fxOne=fxOne,
                           fxTwo=fxTwo,
                           gxOne=gxOne,
                           gxTwo=gxTwo)

    try:
        reloadWebsite()
    except:
        print('reload failed')
class TrainDataReader():
    def __init__(self, config, category_dir, vocab_file):
        self.config = config
        self.category_dir = category_dir
        self.tokenizer = FullTokenizer(vocab_file)
        if not os.path.exists(
                os.path.join(self.category_dir, 'train_data', 'raw.csv')):
            raise Exception("local raw train data does not exist!")
        if not os.path.exists(vocab_file):
            raise Exception("local vocab_file does not exist")

    def transform(self):
        with open(os.path.join(self.category_dir, 'train_data', 'raw.csv')) as fr, \
             open(os.path.join(self.category_dir, 'attr_values.pkl'), 'wb') as fwa:
            attr_values_c = {}
            for row in fr:
                if row.strip() == '' or len(row.strip().split('\t')) != 10:
                    continue
                segment = row.strip().split('\t')
                attr_values_c[(segment[8], segment[9])] = 1
            attr_values = {k: i for i, k in enumerate(attr_values_c.keys())}
            attr_values_r = {i: k for k, i in attr_values.items()}
            print('start to write local attr_values.pkl!!')
            pickle.dump((attr_values, attr_values_r), fwa)
        with open(os.path.join(self.category_dir, 'train_data', 'raw.csv')) as fr, \
             open(os.path.join(self.category_dir, 'train_data', 'transform.csv'), 'w') as fwt:
            print('start to write local train_data transform.csv!!')
            for row in fr:
                if row.strip() == '' or len(row.strip().split('\t')) != 10:
                    continue
                segment = row.strip().split('\t')
                label = attr_values[(segment[8], segment[9])]
                tokens = self.tokenizer.tokenize(segment[7])
                if len(tokens) > self.config.max_seq_length - 2:
                    tokens = tokens[0:self.config.max_seq_length - 2]
                tokens = ['[CLS]'] + tokens + ['[SEP]']
                token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                token_ids_patch = (token_ids[0:self.config.max_seq_length]
                                   + [0] * (self.config.max_seq_length - len(token_ids)))
                token_ids_patch = [str(x) for x in token_ids_patch]
                fwt.write(
                    str(label) + ',' +
                    str(min(len(token_ids), len(token_ids_patch))) + ',' +
                    ','.join(token_ids_patch) + '\n')
        return len(attr_values)

    def read(self):
        transform = os.path.join(self.category_dir, 'train_data',
                                 'transform.csv')
        queue = tf.train.string_input_producer([transform])
        reader = tf.TextLineReader()
        _, value = reader.read(queue)
        row = tf.decode_csv(value, [[0]] * (self.config.max_seq_length + 2))
        label = tf.stack(row[0])
        length = tf.stack(row[1])
        mask = tf.cast(tf.sequence_mask(length, self.config.max_seq_length),
                       tf.int32)
        sequence = tf.stack(row[2:self.config.max_seq_length + 2])
        return tf.train.shuffle_batch([label, sequence, mask],
                                      self.config.batch_size, 50000, 10000)
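A hedged usage sketch; train_config is a hypothetical config object that, as used above, must expose max_seq_length and batch_size, and the paths are placeholders:

# Illustrative only.
reader = TrainDataReader(config=train_config, category_dir='./category',
                         vocab_file='./vocab.txt')
output_dim = reader.transform()           # writes attr_values.pkl and transform.csv
label, sequence, mask = reader.read()     # TF1 queue-based batch tensors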
Example #22
    input_file
)  # [:1] # array of lists [text, target_word_idx, correct_word_idx]
with open('100_texts', 'w') as f:
    for example in data:
        f.write(example[0])

bert_tokens = []
token_map = []
tokenizer = FullTokenizer(vocab_file=bert + 'vocab.txt', do_lower_case=False)

for text in data:
    text_tokens = ['[CLS]']
    text_map = []
    for word in text[0].split(' '):
        text_map.append(len(text_tokens))
        text_tokens.extend(tokenizer.tokenize(word))

    token_map.append(text_map)
    bert_tokens.append(text_tokens)

args = ['python', '../bert/extract_features.py']
args.append('--input_file=100_texts')
args.append('--output_file=anaphora')
args.append('--vocab_file=' + bert + 'vocab.txt')
args.append('--bert_config_file=' + bert + 'bert_config.json')
args.append('--init_checkpoint=' + bert + 'bert_model.ckpt')
args.append('--layers=' + layers)
args.append('--max_seq_length=128')
args.append('--batch_size=8')
args.append('--do_lower_case=False')
args.append('--attention=True')
Example #23
class Tuner(object):

    def __init__(self, train_corpus_fname=None, tokenized_train_corpus_fname=None,
                 test_corpus_fname=None, tokenized_test_corpus_fname=None,
                 model_name="bert", model_save_path=None, vocab_fname=None, eval_every=1000,
                 batch_size=32, num_epochs=10, dropout_keep_prob_rate=0.9, model_ckpt_path=None,
                 sp_model_path=None):
        # configurations
        tf.logging.set_verbosity(tf.logging.INFO)
        self.model_name = model_name
        self.eval_every = eval_every
        self.model_ckpt_path = model_ckpt_path
        self.model_save_path = model_save_path
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.dropout_keep_prob_rate = dropout_keep_prob_rate
        self.best_valid_score = 0.0
        if not os.path.exists(model_save_path):
            os.mkdir(model_save_path)
        # define tokenizer
        if self.model_name == "bert":
            self.tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
        elif self.model_name == "xlnet":
            sp = spm.SentencePieceProcessor()
            sp.Load(sp_model_path)
            self.tokenizer = sp
        else:
            self.tokenizer = get_tokenizer("mecab")
        # load or tokenize corpus
        self.train_data, self.train_data_size = self.load_or_tokenize_corpus(train_corpus_fname, tokenized_train_corpus_fname)
        self.test_data, self.test_data_size = self.load_or_tokenize_corpus(test_corpus_fname, tokenized_test_corpus_fname)

    def load_or_tokenize_corpus(self, corpus_fname, tokenized_corpus_fname):
        data_set = []
        if os.path.exists(tokenized_corpus_fname):
            tf.logging.info("load tokenized corpus : " + tokenized_corpus_fname)
            with open(tokenized_corpus_fname, 'r') as f1:
                for line in f1:
                    tokens, label = line.strip().split("\u241E")
                    if len(tokens) > 0:
                        data_set.append([tokens.split(" "), int(label)])
        else:
            tf.logging.info("tokenize corpus : " + corpus_fname + " > " + tokenized_corpus_fname)
            with open(corpus_fname, 'r') as f2:
                next(f2)  # skip head line
                for line in f2:
                    sentence, label = line.strip().split("\u241E")
                    if self.model_name == "bert":
                        tokens = self.tokenizer.tokenize(sentence)
                    elif self.model_name == "xlnet":
                        normalized_sentence = preprocess_text(sentence, lower=False)
                        tokens = encode_pieces(self.tokenizer, normalized_sentence, return_unicode=False, sample=False)
                    else:
                        tokens = self.tokenizer.morphs(sentence)
                        tokens = post_processing(tokens)
                    if int(label) > 0.5:
                        int_label = 1
                    else:
                        int_label = 0
                    data_set.append([tokens, int_label])
            with open(tokenized_corpus_fname, 'w') as f3:
                for tokens, label in data_set:
                    f3.writelines(' '.join(tokens) + "\u241E" + str(label) + "\n")
        return data_set, len(data_set)

    def train(self, sess, saver, global_step, output_feed):
        train_batches = self.get_batch(self.train_data, num_epochs=self.num_epochs, is_training=True)
        checkpoint_loss = 0.0
        for current_input_feed in train_batches:
            _, _, _, current_loss = sess.run(output_feed, current_input_feed)
            checkpoint_loss += current_loss
            if global_step.eval(sess) % self.eval_every == 0:
                tf.logging.info("global step %d train loss %.4f" %
                                (global_step.eval(sess), checkpoint_loss / self.eval_every))
                checkpoint_loss = 0.0
                self.validation(sess, saver, global_step)

    def validation(self, sess, saver, global_step):
        valid_loss, valid_pred, valid_num_data = 0, 0, 0
        output_feed = [self.logits, self.loss]
        test_batches = self.get_batch(self.test_data, num_epochs=1, is_training=False)
        for current_input_feed, current_labels in test_batches:
            current_logits, current_loss = sess.run(output_feed, current_input_feed)
            current_preds = np.argmax(current_logits, axis=-1)
            valid_loss += current_loss
            valid_num_data += len(current_labels)
            for pred, label in zip(current_preds, current_labels):
                if pred == label:
                    valid_pred += 1
        valid_score = valid_pred / valid_num_data
        tf.logging.info("valid loss %.4f valid score %.4f" %
                        (valid_loss, valid_score))
        if valid_score > self.best_valid_score:
            self.best_valid_score = valid_score
            path = self.model_save_path + "/" + str(valid_score)
            saver.save(sess, path, global_step=global_step)

    def get_batch(self, data, num_epochs, is_training=True):
        if is_training:
            data_size = self.train_data_size
        else:
            data_size = self.test_data_size
        num_batches_per_epoch = int((data_size - 1) / self.batch_size)
        if is_training:
            tf.logging.info("num_batches_per_epoch : " + str(num_batches_per_epoch))
        for epoch in range(num_epochs):
            idx = random.sample(range(data_size), data_size)
            data = np.array(data)[idx]
            for batch_num in range(num_batches_per_epoch):
                batch_sentences = []
                batch_labels = []
                start_index = batch_num * self.batch_size
                end_index = (batch_num + 1) * self.batch_size
                features = data[start_index:end_index]
                for feature in features:
                    sentence, label = feature
                    batch_sentences.append(sentence)
                    batch_labels.append(int(label))
                yield self.make_input(batch_sentences, batch_labels, is_training)

    def make_input(self, sentences, labels, is_training):
        raise NotImplementedError

    def tune(self):
        raise NotImplementedError
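As a rough illustration of what a concrete subclass could provide, here is a minimal make_input sketch. The placeholder attributes (self.input_ids, self.input_mask, self.label, self.dropout_keep_prob) are hypothetical names for graph inputs a real model class would define:

class BertClassificationTunerSketch(Tuner):
    def make_input(self, sentences, labels, is_training):
        # Wrap each token list with [CLS]/[SEP], convert to ids and right-pad to a common length.
        max_len = max(len(tokens) for tokens in sentences) + 2
        input_ids, input_mask = [], []
        for tokens in sentences:
            ids = self.tokenizer.convert_tokens_to_ids(["[CLS]"] + list(tokens) + ["[SEP]"])
            input_mask.append([1] * len(ids) + [0] * (max_len - len(ids)))
            input_ids.append(ids + [0] * (max_len - len(ids)))
        feed = {
            self.input_ids: np.array(input_ids),    # hypothetical placeholder
            self.input_mask: np.array(input_mask),  # hypothetical placeholder
            self.label: np.array(labels),           # hypothetical placeholder
            self.dropout_keep_prob: self.dropout_keep_prob_rate if is_training else 1.0,
        }
        # train() iterates over feed dicts, validation() expects (feed dict, labels) pairs.
        return feed if is_training else (feed, labels)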
Example #24
0
class BertPreprocessor(Preprocessor):
    """Preprocessor for BERT embedding.

    This class can be used to do all the work to create the inputs (and outputs) of a Neural Network using BERT
    as embedding. Currently only single sequence classification is supported.
    """

    def __init__(self,
                 pretrained_model_path: str,
                 **kwargs):
        super().__init__(**kwargs)

        info = hub.Module(spec=pretrained_model_path)(signature="tokenization_info", as_dict=True)

        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run(
                [
                    info["vocab_file"],
                    info["do_lower_case"]
                ]
            )

        # Create the tokenizer with the vocabulary of the pretrained model
        self._tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

        basic_tokens = self._tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]"])
        self._CLS_token = basic_tokens[0]
        self._SEP_token = basic_tokens[1]

    def _padding_sentence(self):
        """Return a zero length sentence to pad last batch.

        :return: Three sequences of zeros (tokens, masks, segment ids).
        """

        return [0] * self._max_seq_len, [0] * self._max_seq_len, [0] * self._max_seq_len

    def tokenize(self, text: str):
        """Convert a sequence of words into a sequence of tokens and also compute the masking- and segment ids.

        For further details please read BERT paper.

        :param text: The sequence of words.
        :return: The sequence of tokens, masks and segment ids.
        """

        input_ids = [0] * self._max_seq_len
        input_mask = [0] * self._max_seq_len
        input_segment_ids = [0] * self._max_seq_len

        tokens_input = self._tokenizer.tokenize(text)

        # if too long cut to size (the first token will be [CLS], the last [SEP])
        if len(tokens_input) > self._max_seq_len - 2:
            tokens_input = tokens_input[0: (self._max_seq_len - 2)]

        idx = 0
        input_ids[idx] = self._CLS_token
        idx += 1

        for element in self._tokenizer.convert_tokens_to_ids(tokens_input):
            input_ids[idx] = element
            idx += 1

        input_ids[idx] = self._SEP_token

        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        for i in range(idx + 1):
            input_mask[i] = 1

        # safety check
        assert len(input_ids) == self._max_seq_len
        assert len(input_mask) == self._max_seq_len
        assert len(input_segment_ids) == self._max_seq_len

        return input_ids, input_mask, input_segment_ids

    def fit(self, texts: List[str]) -> 'BertPreprocessor':
        """This function does nothing in case of BERT but must be implemented.

        :param texts: -
        :return: self
        """

        return self

    def transform(self, texts: List[str]) -> list:
        """Transform sequences of words into sequences of tokens, masks and segment ids.

        Masks are used to separate valid and padding tokens. Here the segment ids are always one since the whole
        sequence belongs together.

        For further details please read BERT paper.

        :param texts: The sequences of texts.
        :return: The sequences of tokens, masks and segment ids.
        """
        
        # Tokenize all texts in parallel; each worker returns (input_ids, input_mask, segment_ids).
        with Pool(processes=8) as pool:
            input_ids, input_masks, segment_ids = zip(*pool.map(self.tokenize, texts))

        return [input_ids, input_masks, segment_ids]

    def inverse_transform(self, sequences: np.ndarray):
        """Transform sequences of tokens back to sequences of words (sentences).

        :param sequences: The sequences of tokens.
        :return: The sequences of words
        """

        return self._tokenizer.convert_ids_to_tokens(sequences)
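A hedged usage sketch; the hub module URL is one public BERT module, and the max-sequence-length keyword is an assumption about the Preprocessor base class that provides self._max_seq_len:

preprocessor = BertPreprocessor(
    pretrained_model_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
    max_seq_len=128)  # keyword name is an assumption about the base class
input_ids, input_masks, segment_ids = preprocessor.transform(["BERT preprocessing in one call."])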
class EntityInfer(LoadModelBase):
    def __init__(self,
                 vocab_file,
                 export_dir=None,
                 url=None,
                 model_name='models',
                 signature_name=None,
                 do_lower_case=True):
        super(EntityInfer, self).__init__(export_dir, url, model_name,
                                          signature_name)
        self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)
        # connect through gRPC
        if url:
            self.stub, self.request = self.load_grpc_connect()

        if export_dir:
            self.predict_fn = self.load_pb_model()

        self.id_map_predicate = self.id_to_label(model_config.PREDICATE_LABEL)
        self.predicate_map_id = self.label_to_id(model_config.PREDICATE_LABEL)
        self.id_map_sequence = self.id_to_label(model_config.SEQ_LABEL)

    def id_to_label(self, labels):
        return dict([(i, label) for i, label in enumerate(labels)])

    def label_to_id(self, labels):
        return dict([(label, i) for i, label in enumerate(labels)])

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length."""

        # This is a simple heuristic which will always truncate the longer sequence
        # one token at a time. This makes more sense than truncating an equal percent
        # of tokens from each, since if one sequence is very short then each token
        # that's truncated likely contains more information than a longer sequence.
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def process(self, sentences, predicate_labels, max_seq_length=64):
        if not sentences or not isinstance(sentences, (list, tuple)):
            raise ValueError(
                '`sentences` must be a non-empty list or tuple!')

        examples = []
        for sentence, predicate_label in zip(sentences, predicate_labels):
            feature = self.convert_single_example(sentence, predicate_label,
                                                  max_seq_length)
            example = self.convert_single_feature(feature)
            examples.append(example)

        return examples

    def convert_single_example(self, sentence, predicate_label,
                               max_seq_length):
        tokens = []
        for token in sentence:
            tokens.extend(self.tokenizer.tokenize(token))

        tokens_b = [predicate_label] * len(tokens)
        predicate_label_id = self.predicate_map_id[predicate_label]

        # Truncate tokens and tokens_b so their combined length is at most max_seq_length - 3
        self._truncate_seq_pair(tokens, tokens_b, max_seq_length - 3)

        tokens_a = []
        segment_ids = []
        tokens_a.append("[CLS]")
        segment_ids.append(0)
        for token in tokens:
            tokens_a.append(token)
            segment_ids.append(0)

        tokens_a.append("[SEP]")
        segment_ids.append(0)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens_a)

        # self.tokenizer.convert_tokens_to_ids(["[SEP]"]) ---> [102]
        # vocab ids 1-100 are unused in the BERT vocab, so shifted predicate label ids cannot collide with word ids
        bias = 1
        for token in tokens_b:
            # add bias so predicate label ids stay inside the unused vocab range
            tokens.append(token)
            input_ids.append(predicate_label_id + bias)
            segment_ids.append(1)

        tokens.append('[SEP]')
        # the `[SEP]` token id is 102
        input_ids.append(self.tokenizer.convert_tokens_to_ids(["[SEP]"])[0])
        segment_ids.append(1)

        input_mask = [1] * len(input_ids)

        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            tokens.append("[Padding]")

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        feature = InputFeatures(input_ids=input_ids,
                                input_mask=input_mask,
                                segment_ids=segment_ids)

        return feature

    def convert_single_feature(self, feature):
        features = dict()
        features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(
            value=feature.input_ids))
        features['input_mask'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=feature.input_mask))
        features['segment_ids'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=feature.segment_ids))
        example = tf.train.Example(features=tf.train.Features(
            feature=features))

        return example.SerializeToString()

    def infer(self,
              sentences,
              predicate_labels,
              max_seq_length,
              predicate_probabilities=None):
        """
        预测调用
        sentences: list,句子,['xxxx', 'xxxx'...]
        predicate_labels: list, 标签, ['作者', '出生地'...]
        max_seq_length: int
        predicate_probabilities: list, [0.92, 0.01, ...]
        :return:
        list, [
        [{'predicate': predicate, 'subject': subj, 'object': entity}, {'predicate': predicate...],
        [{'predicate': predicate, 'subject': subj, 'object': entity}, {'predicate': predicate...]...
        ]
        """
        examples = self.process(sentences, predicate_labels, max_seq_length)
        if self.url:
            predictions = self.tf_serving_infer(examples)
        else:
            s = time.time()
            predictions = self.local_infer(examples)
            print('sequence:', time.time() - s)

        token_label_predictions = predictions['token_label_predictions']
        predicate_predictions = predictions['predicate_predictions']
        predicate_labels_index = np.argmax(predicate_predictions, -1)

        result = []
        for i in range(len(sentences)):
            token_label = list(
                map(lambda x: self.id_map_sequence[x],
                    token_label_predictions[i]))
            entities = self.entity_extract(
                sentences[i], token_label[1:token_label.index('[SEP]')])
            predicate_label_index = predicate_labels_index[i]
            # compare the relation classification output with the sequence labeling output
            if predicate_probabilities:
                predicate_label = max(
                    [(predicate_labels[i], predicate_probabilities[i]),
                     (self.id_map_predicate[predicate_label_index],
                      predicate_predictions[i][predicate_label_index])],
                    key=lambda x: x[1])
            else:
                predicate_label = (
                    self.id_map_predicate[predicate_label_index],
                    predicate_predictions[i][predicate_label_index])

            triplets = self.organize_triplet(entities, predicate_label[0])
            if triplets:
                result.append(triplets)

        return result

    def organize_triplet(self, entities, predicate):
        """
        把三元组转成字典形式, 可解决一个关系、一个主体(subject)、多个客体(object)
        entities: list, [('xx公司', 'SUB'), ('xx公司', 'OBJ')]
        predicate: str, 关系
        :return:
        list, [{'predicate': predicate, 'subject': subj, 'object': entity},
               {'predicate': predicate, 'subject': subj, 'object': entity}...]
        """
        triplets = []
        subj = None
        for entity, tag in entities:
            if tag == 'SUB':
                subj = entity
                break

        for entity, tag in entities:
            if tag == 'OBJ':
                triplet = {
                    'predicate': predicate,
                    'subject': subj,
                    'object': entity
                }
                triplets.append(triplet)

        return triplets

    def entity_extract(self, sentence, tags):
        """
        依据tags,从sentence抽取实体
        sentence: str,句子
        tags: list, 序列标记,例如 ['O', 'B-SUB', 'I-SUB'...]
        :return:
        list, [('xx公司', 'SUB'), ('xx公司', 'OBJ')]
        """
        entities = []
        sentence_len = len(sentence)
        if sentence_len != len(tags):
            warnings.warn(
                'Token and tags have different lengths.\ndetails:\n{}\n{}'.
                format(sentence, tags))

        entity = Entity(None)
        t_zip = zip(sentence, tags)

        for i, (token, tag) in enumerate(t_zip):
            if tag == 'O':
                if entity.types:
                    entities.append(entity.get_entity_types())
                    entity = Entity(None)
                continue

            elif tag[0] == 'B':
                if entity.types:
                    entities.append(entity.get_entity_types())
                entity = Entity(tag[2:])
                entity.begin = token

            elif tag[0] == 'I':
                if i == sentence_len - 1:
                    entity.intermediate = token
                    entities.append(entity.get_entity_types())
                    break

                try:
                    entity.intermediate = token
                except Exception as e:
                    print(e)

        return entities

    def tf_serving_infer(self, examples):
        self.request.inputs['examples'].CopyFrom(
            tf.make_tensor_proto(examples, dtype=types_pb2.DT_STRING))
        response = self.stub.Predict(self.request, 5.0)
        predictions = {}
        for key in response.outputs:
            tensor_proto = response.outputs[key]
            nd_array = tf.contrib.util.make_ndarray(tensor_proto)
            predictions[key] = nd_array

        return predictions

    def local_infer(self, examples):
        """
        本地进行预测,参数解释同上
        """
        predictions = self.predict_fn({'examples': examples})

        return predictions
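Since _truncate_seq_pair trims the longer list one token at a time until the combined length fits, a small standalone illustration (plain Python, no model needed):

tokens_a = list('abcdefgh')             # 8 tokens
tokens_b = ['REL'] * 8                  # predicate label repeated, as in convert_single_example
# calling the unbound method; `self` is unused inside, so None is fine here
EntityInfer._truncate_seq_pair(None, tokens_a, tokens_b, max_length=13)
print(len(tokens_a), len(tokens_b))     # -> 7 6, combined length reduced to 13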
Example #26
0
    return segments + [0] * (max_seq_length - len(tokens))


def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids


vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

s = "This is a nice sentence."
stokens = tokenizer.tokenize(s)
stokens = ["[CLS]"] + stokens + ["[SEP]"]
print('s       :', s)
print('stokens :', stokens)
print()

input_ids = get_ids(stokens, tokenizer, max_seq_length)
input_masks = get_masks(stokens, max_seq_length)
input_segments = get_segments(stokens, max_seq_length)

pool_embs, all_embs = model.predict([[input_ids],[input_masks],[input_segments]])

import numpy as np
cls_embs = np.array([all_embs[0][0]])

print('pool_embs.shape: ', pool_embs.shape)
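get_masks and the head of get_segments are referenced above but fall outside this excerpt; a minimal get_masks sketch consistent with get_ids, assuming the same padding convention, might be:

def get_masks(tokens, max_seq_length):
    """Mask with 1 for real tokens and 0 for padding (assumes len(tokens) <= max_seq_length)."""
    return [1] * len(tokens) + [0] * (max_seq_length - len(tokens))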
Example #27
0
class BertEmbeddingsResolver:
    def __init__(self, model_folder, max_length=256, lowercase=True):

        # 1. Create tokenizer
        self.max_length = max_length
        vocab_file = os.path.join(model_folder, 'vocab.txt')
        self.tokenizer = FullTokenizer(vocab_file, do_lower_case=lowercase)

        # 2. Read Config
        config_file = os.path.join(model_folder, 'bert_config.json')
        self.config = BertConfig.from_json_file(config_file)

        # 3. Create Model
        self.session = tf.Session()
        self.token_ids_op = tf.placeholder(tf.int32,
                                           shape=(None, max_length),
                                           name='token_ids')
        self.model = BertModel(config=self.config,
                               is_training=False,
                               input_ids=self.token_ids_op,
                               use_one_hot_embeddings=False)

        # 4. Restore Trained Model
        self.saver = tf.train.Saver()
        ckpt_file = os.path.join(model_folder, 'bert_model.ckpt')
        # RCS ckpt_file = os.path.join(model_folder, 'model.ckpt-1000000')
        self.saver.restore(self.session, ckpt_file)

        hidden_layers = self.config.num_hidden_layers
        self.embeddings_op = tf.get_default_graph().get_tensor_by_name(
            "bert/encoder/Reshape_{}:0".format(hidden_layers + 1))

    def tokenize_sentence(self, tokens, add_service_tokens=True):
        result = []
        is_word_start = []
        for token in tokens:
            pieces = self.tokenizer.tokenize(token)
            result.extend(pieces)
            starts = [False] * len(pieces)
            starts[0] = True
            is_word_start.extend(starts)

        if add_service_tokens:
            if len(result) > self.max_length - 2:
                result = result[:self.max_length - 2]
                is_word_start = is_word_start[:self.max_length - 2]

            result = ['[CLS]'] + result + ['[SEP]']
            is_word_start = [False] + is_word_start + [False]
        else:
            if len(result) > self.max_length:
                result = result[:self.max_length]
                is_word_start = is_word_start[:self.max_length]

        return (result, is_word_start)

    def resolve_sentences(self, sentences):
        batch_is_word_start = []
        batch_token_ids = []
        batch_tokens = []

        for sentence in sentences:
            tokens, is_word_start = self.tokenize_sentence(sentence)
            token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            to_input = np.pad(token_ids,
                              [(0, self.max_length - len(token_ids))],
                              mode='constant')
            batch_token_ids.append(to_input)
            batch_tokens.append(tokens)
            batch_is_word_start.append(is_word_start)

        embeddings = self.session.run(
            self.embeddings_op, feed_dict={self.token_ids_op: batch_token_ids})

        result = []
        for i in range(len(sentences)):
            tokens = batch_tokens[i]
            is_word_start = batch_is_word_start[i]
            item_embeddings = embeddings[i, :len(tokens), :]

            resolved = TokenEmbeddings.create_sentence(tokens, is_word_start,
                                                       item_embeddings)
            result.append(resolved)

        return result

    def resolve_sentence(self, sentence):
        tokens, is_word_start = self.tokenize_sentence(sentence)

        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        to_input = np.pad(token_ids, [(0, self.max_length - len(token_ids))],
                          mode='constant')
        to_input = to_input.reshape((1, self.max_length))

        embeddings = self.session.run(self.embeddings_op,
                                      feed_dict={self.token_ids_op: to_input})
        embeddings = np.squeeze(embeddings)
        embeddings = embeddings[:len(token_ids), :]

        return TokenEmbeddings.create_sentence(tokens, is_word_start,
                                               embeddings)
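A hedged usage sketch; the model folder is a placeholder path containing vocab.txt, bert_config.json and bert_model.ckpt, as the constructor expects:

resolver = BertEmbeddingsResolver(model_folder='/path/to/bert_model_dir', max_length=128)
sentence = ['BERT', 'embeddings', 'for', 'each', 'word']
token_embeddings = resolver.resolve_sentence(sentence)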
Example #28
0
class BERTVectorizer:
    def __init__(
        self,
        sess,
        is_bert,
        # bert_model_hub_path='https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1'
        bert_model_hub_path="https://tfhub.dev/google/albert_base/1"):
        self.sess = sess
        self.is_bert = is_bert
        self.bert_model_hub_path = bert_model_hub_path
        self.create_tokenizer_from_hub_module(is_bert=is_bert)

    def create_tokenizer_from_hub_module(self, is_bert):
        """Get the vocab file and casing info from the Hub module."""
        bert_module = hub.Module(self.bert_model_hub_path)
        tokenization_info = bert_module(signature="tokenization_info",
                                        as_dict=True)
        vocab_file, do_lower_case = self.sess.run([
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"],
        ])

        if is_bert:
            from bert.tokenization import FullTokenizer
            self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            from vectorizers.albert_tokenization import FullTokenizer
            self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case,
                                           spm_model_file=vocab_file)

    def tokenize(self, text: str):
        words = text.split()  # whitespace tokenizer
        tokens = []
        valid_positions = []
        for word in words:
            token = self.tokenizer.tokenize(word)
            tokens.extend(token)
            for i in range(len(token)):
                if i == 0:
                    valid_positions.append(1)
                else:
                    valid_positions.append(0)
        return tokens, valid_positions

    def transform(self, text_arr):
        input_ids = []
        input_mask = []
        segment_ids = []
        valid_positions = []
        for text in text_arr:
            ids, mask, seg_ids, valid_pos = self.__vectorize(text)
            input_ids.append(ids)
            input_mask.append(mask)
            segment_ids.append(seg_ids)
            valid_positions.append(valid_pos)

        sequence_lengths = np.array([len(i) for i in input_ids])
        input_ids = tf.keras.preprocessing.sequence.pad_sequences(
            input_ids, padding='post')
        input_mask = tf.keras.preprocessing.sequence.pad_sequences(
            input_mask, padding='post')
        segment_ids = tf.keras.preprocessing.sequence.pad_sequences(
            segment_ids, padding='post')
        valid_positions = tf.keras.preprocessing.sequence.pad_sequences(
            valid_positions, padding='post')
        return input_ids, input_mask, segment_ids, valid_positions, sequence_lengths

    def __vectorize(self, text: str):
        tokens, valid_positions = self.tokenize(text)
        # insert "[CLS]"
        tokens.insert(0, '[CLS]')
        valid_positions.insert(0, 1)
        # insert "[SEP]"
        tokens.append('[SEP]')
        valid_positions.append(1)

        segment_ids = [0] * len(tokens)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        input_mask = [1] * len(input_ids)

        return input_ids, input_mask, segment_ids, valid_positions
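A hedged usage sketch (TF 1.x session mode; the hub module is downloaded on first use, and the BERT path is passed explicitly so it matches is_bert=True):

with tf.Session() as sess:
    vectorizer = BERTVectorizer(
        sess, is_bert=True,
        bert_model_hub_path='https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1')
    ids, mask, seg_ids, valid_pos, lengths = vectorizer.transform(['book a flight to seattle'])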
Example #29
0
class Tuner(object):
    
    def __init__(self, train_corpus_fname = None,
                tokenized_train_corpus_fname = None,
                test_corpus_fname = None, tokenized_test_corpus_fname= None,
                model_name='bert', model_save_path = None, vocab_fname=None,
                eval_every=1000,
                batch_size=32, num_epochs=10, dropout_keep_prob_rate=0.9,
                model_ckpt_path=None):
        
        self.model_name = model_name
        self.eval_every = eval_every
        self.model_ckpt_path = model_ckpt_path
        self.model_save_path = model_save_path
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.dropout_keep_prob_rate = dropout_keep_prob_rate
        self.best_valid_score = 0.0
        
        #tokenizer defining
        if self.model_name =='bert':
            self.tokenizer = FullTokenizer(vocab_file = vocab_fname, do_lower_case = False)
        else:
            self.tokenizer = get_tokenizer('mecab')
            
        #load or tokenize corpus
        
        self.train_data, self.train_data_size = self.load_or_tokenize_corpus(train_corpus_fname, tokenized_train_corpus_fname)
        self.test_data, self.test_data_size = self.load_or_tokenize_corpus(test_corpus_fname, tokenized_test_corpus_fname)
    
    
    def load_or_tokenize_corpus(self, corpus_fname, tokenized_corpus_fname):
        data_set = []
        if os.path.exists(tokenized_corpus_fname):
            tf.logging.info('load tokenized corpus : ' + tokenized_corpus_fname)
            with open(tokenized_corpus_fname, 'r') as f1:
                for line in f1:
                    tokens, label = line.strip().split('\u241E')
                    if len(tokens) > 0:
                        data_set.append([tokens.split(" "), int(label)])
                        
        else :
            with open(corpus_fname, 'r') as f2:
                next(f2) #skip head line
                for line in f2:
                    sentence, label = line.strip().split('\u241E')
                    if self.model_name == 'bert':
                        tokens = self.tokenizer.tokenize(sentence)
                    else:
                        tokens = self.tokenizer.morphs(sentence)
                        tokens = post_processing(tokens)
                    
                    #labelling
                    if int(label) >=1:
                        int_label = 1
                    else:
                        int_label = 0
                    data_set.append([tokens, int_label])
            with open(tokenized_corpus_fname, 'w') as f3:
                for tokens, label in data_set:
                    f3.writelines(' '.join(tokens) + '\u241E' + str(label) + '\n')
        return data_set, len(data_set)
    
    
    def get_batch(self, data, num_epochs, is_training=True):

        if is_training:
            data_size = self.train_data_size
        else:
            data_size = self.test_data_size
        num_batches_per_epoch = int((data_size - 1) / self.batch_size) + 1

        for epoch in range(num_epochs):
            idx = random.sample(range(data_size), data_size)
            data = np.array(data)[idx]
            for batch_num in range(num_batches_per_epoch):
                batch_sentences = []
                batch_labels = []
                start_index = batch_num * self.batch_size
                end_index = min((batch_num + 1) * self.batch_size, data_size)
                features = data[start_index:end_index]
                for feature in features:
                    sentence, label = feature
                    batch_sentences.append(sentence)
                    batch_labels.append(int(label))
                yield self.make_input(batch_sentences, batch_labels, is_training)
                    
                    
                    
    def train(self, sess, saver, global_step, output_feed):
        train_batches = self.get_batch(self.train_data, self.num_epochs, is_training=True)
        checkpoint_loss = 0.0
        for current_input_feed in train_batches:
            _,_,_, current_loss = sess.run(output_feed, current_input_feed)
            checkpoint_loss += current_loss
            if global_step.eval(sess) % self.eval_every == 0 :
                tf.logging.info("global step %d train loss %.4f" % (global_step.eval(sess), checkpoint_loss / self.eval_every))
                checkpoint_loss = 0.0
                self.validation(sess, saver, global_step)
                
                
    def validation(self, sess, saver, global_step):
        valid_loss, valid_pred, valid_num_data = 0,0,0
        output_feed = [self.logits, self.loss]
        test_batches = self.get_batch(self.test_data, num_epochs = 1, is_training= False)
        
        for current_input_feed, current_labels in test_batches:
            current_logits, current_loss = sess.run(output_feed, current_input_feed)
            current_preds = np.argmax(current_logits, axis= -1)
            valid_loss += current_loss
            valid_num_data += len(current_labels)
            for pred, label in zip(current_preds, current_labels):
                if pred == label :
                    valid_pred +=1
        valid_score = valid_pred / valid_num_data
        tf.logging.info('valid loss %.4f valid score %.4f'%(valid_loss, valid_score))
        
        if valid_score > self.best_valid_score:
            self.best_valid_score = valid_score
            path = self.model_save_path + '/' +str(valid_score)
            saver.save(sess, path, global_step=global_step)
            
            
    def make_input(self, sentences, labels, is_training):
        raise NotImplementedError

    def tune(self):
        raise NotImplementedError
class BERTModel:
    def __init__(self):
        bert_pretrained_dir = args.pretrain_models_path + args.bert_model_name
        self.do_lower_case = args.bert_model_name.startswith('uncased')
        self.vocab_file = os.path.join(bert_pretrained_dir, 'vocab.txt')
        self.config_file = os.path.join(bert_pretrained_dir,
                                        'bert_config.json')
        self.tokenizer = FullTokenizer(vocab_file=self.vocab_file,
                                       do_lower_case=self.do_lower_case)

        self.input_id = tf.placeholder(tf.int64, [None, None], 'input_ids')
        self.input_mask = tf.placeholder(tf.int64, [None, None], 'input_mask')
        self.segment_ids = tf.placeholder(tf.int64, [None, None],
                                          'segment_ids')

        bert_config = BertConfig.from_json_file(self.config_file)
        model = BertModel(config=bert_config,
                          is_training=False,
                          input_ids=self.input_id,
                          input_mask=self.input_mask,
                          token_type_ids=self.segment_ids,
                          use_one_hot_embeddings=True,
                          scope='bert')
        self.output_layer = model.get_sequence_output()
        self.embedding_layer = model.get_embedding_output()

        saver = tf.train.Saver()

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.session = tf.Session(config=config)
        saver.restore(self.session, bert_pretrained_dir + '/bert_model.ckpt')

    def tokenize(self, token_list, attributes_list):
        num_attributes = len(attributes_list)
        output_list = [[] for _ in range(num_attributes)]
        token_ids = []
        masks = []

        token_ids.append("[CLS]")
        for token_id, token in enumerate(token_list):
            new_tokens = self.tokenizer.tokenize(token)
            token_ids.extend(new_tokens)

            for att_id in range(num_attributes):
                l_ = [
                    attributes_list[att_id][token_id]
                    for _ in range(len(new_tokens))
                ]
                output_list[att_id].extend(l_)

            m = [0 for _ in range(len(new_tokens))]
            m[0] = 1
            masks.extend(m)

        token_ids.append("[SEP]")

        token_ids = self.tokenizer.convert_tokens_to_ids(token_ids)
        last_layer, embedding = self.get_embeddings(token_ids)

        if len(last_layer) != len(output_list[0]):
            print(token_list)
            print(token_ids)
            for list_i in output_list:
                print(list_i)

        assert len(last_layer) == len(output_list[0])

        return last_layer, embedding, token_ids[1:-1], output_list, masks

    def get_embeddings(self, token_ids):
        input_mask = [[1] * len(token_ids)]
        segment_ids = [[0] * len(token_ids)]
        input_id = [token_ids]

        outputs, emb = self.session.run(
            [self.output_layer, self.embedding_layer],
            feed_dict={
                self.input_mask: input_mask,
                self.segment_ids: segment_ids,
                self.input_id: input_id
            })

        return outputs[0][1:-1], emb[0][1:-1]

    def tokenize_sentence(self, token_list):
        token_ids = []

        token_ids.append("[CLS]")
        for token_id, token in enumerate(token_list):
            new_tokens = self.tokenizer.tokenize(token)
            token_ids.extend(new_tokens)
        token_ids.append("[SEP]")

        token_ids = self.tokenizer.convert_tokens_to_ids(token_ids)
        return token_ids[1:-1]
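A hedged usage sketch of the attribute-aligned tokenization; paths come from `args`, which is assumed to be configured elsewhere, and the attribute list here is illustrative:

bert_model = BERTModel()
tokens = ['BERT', 'aligns', 'subwords']
pos_tags = ['PROPN', 'VERB', 'NOUN']    # one attribute list, aligned with `tokens`
last_layer, embedding, wordpiece_ids, aligned_attrs, word_start_mask = bert_model.tokenize(
    tokens, [pos_tags])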