def _ner_bert_tokenize(
        tokens: List[str],
        mask: List[int],
        tags: List[str],
        tokenizer: FullTokenizer,
        max_subword_len: int = None,
        mode: str = None,
        token_maksing_prob: float = 0.0
) -> Tuple[List[str], List[int], List[str]]:
    """Expand word-level NER inputs into aligned sub-word sequences.

    Every word is split with ``tokenizer.tokenize``. The first sub-word of a
    word keeps the word's mask flag and tag; the remaining sub-words get
    mask ``0`` and tag ``'X'``. The result is wrapped in ``[CLS]`` / ``[SEP]``
    (both with mask ``0`` and tag ``'X'``).

    :param tokens: words of one sentence.
    :param mask: one flag per word, copied to the word's first sub-word.
    :param tags: one tag per word, copied to the word's first sub-word.
    :param tokenizer: object exposing ``tokenize(str) -> list``.
    :param max_subword_len: words that split into more sub-words than this
        are replaced by a single ``[UNK]``; ``None`` disables the limit.
    :param mode: random masking is only applied when this equals ``'train'``.
    :param token_maksing_prob: probability of replacing all sub-words of a
        word with ``[MASK]`` (only in ``'train'`` mode and when > 0).
    :return: ``(sub-word tokens, sub-word mask, sub-word tags)``.
    """
    out_tokens = ['[CLS]']
    out_mask = [0]
    out_tags = ['X']

    for word, word_flag, word_tag in zip(tokens, mask, tags):
        pieces = tokenizer.tokenize(word)

        # Words the tokenizer cannot split, or that split into too many
        # pieces, collapse to one [UNK] carrying the word's flag and tag.
        if not pieces or (max_subword_len is not None
                          and len(pieces) > max_subword_len):
            out_tokens.append('[UNK]')
            out_mask.append(word_flag)
            out_tags.append(word_tag)
            continue

        # The random draw only happens when masking is actually enabled.
        use_mask = (mode == 'train' and token_maksing_prob > 0.0
                    and np.random.rand() < token_maksing_prob)
        out_tokens += ['[MASK]'] * len(pieces) if use_mask else pieces

        tail = len(pieces) - 1
        out_mask += [word_flag] + [0] * tail
        out_tags += [word_tag] + ['X'] * tail

    out_tokens.append('[SEP]')
    out_mask.append(0)
    out_tags.append('X')
    return out_tokens, out_mask, out_tags
def parse_text(text):
    """Collect frequency counters from a tab-separated, blank-line-delimited corpus.

    ``text`` is split into sentences on blank lines (``'\\n\\n'``) and each
    sentence into lines. Lines shorter than 8 characters are skipped; the
    remaining lines are split on tabs. Column 0 is tokenized into sub-words
    with a ``FullTokenizer`` built from the module-level ``VOCAB_FILE`` /
    ``DO_LOWER_CASE`` settings, and columns 2, 3 and 4 are counted as-is.

    :param text: the whole corpus as one string.
    :return: ``(all_pos, all_dep, all_path, all_vocab)`` counters for
        column 2, column 3, column 4 and the sub-words of column 0.
    :raises IndexError: if a kept line has fewer than five tab-separated
        fields.
    """
    all_pos = Counter()
    all_dep = Counter()
    all_path = Counter()
    all_vocab = Counter()
    tokenizer = FullTokenizer(vocab_file=VOCAB_FILE,
                              do_lower_case=DO_LOWER_CASE)

    for sentence in text.split('\n\n'):
        token_sequence = [
            line.split('\t') for line in sentence.split('\n')
            if len(line) >= 8
        ]
        # Update the vocabulary counter token by token instead of first
        # concatenating all sub-word lists with sum(..., []), which copies
        # the growing list on every step (quadratic in sentence length).
        for item in token_sequence:
            all_vocab.update(tokenizer.tokenize(item[0]))
        all_pos.update(item[2] for item in token_sequence)
        all_dep.update(item[3] for item in token_sequence)
        all_path.update(item[4] for item in token_sequence)

    return all_pos, all_dep, all_path, all_vocab
def bert_tokenize(vocab_fname, corpus_fname, output_fname):
    """Tokenize a corpus file line by line and write the result to a file.

    Each input line is stripped, converted with ``convert_to_unicode`` and
    tokenized with a case-preserving ``FullTokenizer``; the tokens are
    written space-separated, one sentence per output line.

    :param vocab_fname: path of the vocabulary file for the tokenizer.
    :param corpus_fname: path of the UTF-8 input corpus (one sentence per line).
    :param output_fname: path of the UTF-8 output file (overwritten).
    """
    tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:
        for line in f1:
            sentence = line.strip()
            tokens = tokenizer.tokenize(convert_to_unicode(sentence))
            # write(), not writelines(): writelines() on a str iterates it
            # and writes one character at a time.
            f2.write(' '.join(tokens) + '\n')
class BertTokenizer:
    """Turn raw sentences into fixed-length BERT inputs.

    The vocabulary file and lower-casing flag are read from the TF-Hub
    ``KerasLayer`` loaded from ``bert_path``.
    """

    def __init__(self, bert_path, tokenizer_cls=FullTokenizer, maxlen=512):
        """
        :param bert_path: handle/path passed to ``hub.KerasLayer``.
        :param tokenizer_cls: tokenizer class, called as
            ``tokenizer_cls(vocab_file, do_lower_case)``.
        :param maxlen: default length of every produced id sequence.
        """
        self.maxlen = maxlen
        bert_layer = hub.KerasLayer(bert_path, trainable=True)
        vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
        do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
        # Honour the caller-supplied tokenizer class (it used to be ignored).
        self.tokenizer = tokenizer_cls(vocab_file, do_lower_case)

    def convert_sentences_to_ids(self, sentences):
        """Encode every sentence and stack the id lists into one array."""
        return np.array(
            [self.convert_single_sentence_to_ids(s) for s in sentences])

    def convert_single_sentence_to_ids(self, sentence):
        """Encode one sentence as ``[CLS] tokens [SEP]`` padded to ``maxlen``.

        Tokens that do not fit are dropped so that the result always has
        exactly ``maxlen`` ids.
        """
        tokens = self.tokenize(sentence)[:max(self.maxlen - 2, 0)]
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        tokens += (self.maxlen - len(tokens)) * ['[PAD]']
        return self.tokenizer.convert_tokens_to_ids(tokens)

    def convert_two_sentence_to_ids(self,
                                    sent1,
                                    sent2,
                                    maxlen=None,
                                    return_tokens=False):
        """Encode a pair as ``[CLS] sent1 [SEP] sent2 [SEP]`` padded to ``maxlen``.

        The second sentence is truncated first; the first sentence is only
        truncated when it alone exceeds the ``maxlen - 3`` token budget.

        :param maxlen: target length; falls back to ``self.maxlen`` when falsy.
        :param return_tokens: when true, return
            ``(tokens1, tokens2, ids)`` with the (possibly truncated) token
            lists instead of just ``ids``.
        """
        if not maxlen:
            maxlen = self.maxlen
        budget = max(maxlen - 3, 0)  # room left after [CLS], [SEP], [SEP]
        tokens1 = self.tokenize(sent1)[:budget]
        # budget - len(tokens1) is never negative here, so the slice cannot
        # wrap around and keep the wrong part of the second sentence.
        tokens2 = self.tokenize(sent2)[:budget - len(tokens1)]
        tokens = ['[CLS]'] + tokens1 + ['[SEP]'] + tokens2 + ['[SEP]']
        tokens += (maxlen - len(tokens)) * ['[PAD]']
        ids = self.tokenizer.convert_tokens_to_ids(tokens)
        if return_tokens:
            return tokens1, tokens2, ids
        return ids

    def convert_sentence_to_features(self, sent1, sent2, maxlen=None):
        """Return ``(token_ids, input_mask, segment_ids)`` for a sentence pair.

        Segment ids are 0 for ``[CLS] sent1 [SEP]`` and 1 for
        ``sent2 [SEP]``; the mask is 1 on real tokens and 0 on padding.
        """
        if not maxlen:
            maxlen = self.maxlen
        tokens1, tokens2, token_ids = self.convert_two_sentence_to_ids(
            sent1, sent2, maxlen, return_tokens=True)
        segment_ids = [0] * (len(tokens1) + 2) + [1] * (len(tokens2) + 1)
        input_mask = [1] * len(segment_ids)
        segment_ids += (maxlen - len(segment_ids)) * [0]
        input_mask += (maxlen - len(input_mask)) * [0]
        return token_ids, input_mask, segment_ids

    def tokenize(self, sent):
        """Split ``sent`` into tokens with the wrapped tokenizer."""
        return self.tokenizer.tokenize(sent)
class Inferer:
    """Run a restored TF1 checkpoint on raw text sequences."""

    def __init__(self, checkpoint, attr_values_file, vocab_file):
        """Validate the paths, load label data and restore the model.

        :param checkpoint: directory containing the TensorFlow checkpoint.
        :param attr_values_file: pickle file holding a pair
            ``(attr_values, attr_values_r)``; only the second element is kept.
        :param vocab_file: vocabulary file for ``FullTokenizer``.
        :raises Exception: if any path is missing or no checkpoint is found.
        """
        self.checkpoint = checkpoint
        self.attr_values_file = attr_values_file
        self.vocab_file = vocab_file

        if not os.path.exists(self.checkpoint):
            raise Exception("local checkpoint %s not exists" % self.checkpoint)
        if not os.path.exists(self.attr_values_file):
            raise Exception("local attr_values_file %s not exists" %
                            self.attr_values_file)
        if not os.path.exists(self.vocab_file):
            raise Exception("local vocab_file %s not exists" % self.vocab_file)

        self.config = InferConfig()
        self.tokenizer = FullTokenizer(self.vocab_file)

        # NOTE(security): pickle.load executes arbitrary code embedded in the
        # file. Only point attr_values_file at trusted, locally produced data.
        with open(self.attr_values_file, 'rb') as fr:
            attr_values, attr_values_r = pickle.load(fr)
        self.attr_values_r = attr_values_r
        self.config.output_dim = len(attr_values_r)

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.input_ids_p = tf.placeholder(
                tf.int32, [None, self.config.max_seq_length])
            self.token_type_ids_p = tf.placeholder(
                tf.int32, [None, self.config.max_seq_length])
            self.input_mask_p = tf.placeholder(
                tf.int32, [None, self.config.max_seq_length])

            model = Model(self.config)
            self.inference = model.infer(self.input_ids_p,
                                         self.token_type_ids_p,
                                         self.input_mask_p)

            ckpt_state = tf.train.get_checkpoint_state(self.checkpoint)
            if not (ckpt_state and ckpt_state.model_checkpoint_path):
                raise Exception('No model to eval yet at: ' + self.checkpoint)

            self.sess = tf.Session(
                config=tf.ConfigProto(allow_soft_placement=True))
            saver = tf.train.Saver()
            saver.restore(self.sess, ckpt_state.model_checkpoint_path)

    def infer(self, sequences):
        """Predict one attribute value per non-empty input sequence.

        Empty strings are skipped, so the result can be shorter than
        ``sequences``. When nothing is left to predict an empty list is
        returned without touching the session.

        :param sequences: iterable of raw text strings.
        :return: list of entries of ``attr_values_r`` selected by the model.
        """
        transforms = [self._transform(s) for s in sequences if s != '']
        if not transforms:
            # zip(*[]) yields nothing, which previously made the three-way
            # unpacking below fail with ValueError.
            return []

        input_ids, token_type_ids, input_mask = (
            list(column) for column in zip(*transforms))
        with self.graph.as_default():
            result = self.sess.run(self.inference,
                                   feed_dict={
                                       self.input_ids_p: input_ids,
                                       self.token_type_ids_p: token_type_ids,
                                       self.input_mask_p: input_mask
                                   })
        return [self.attr_values_r[e] for e in result]

    def _transform(self, sequence):
        """Encode one string as ``(input_ids, token_type_ids, input_mask)``.

        The text is tokenized, clipped so that ``[CLS]``/``[SEP]`` still fit
        into ``config.max_seq_length``, converted to ids and zero-padded.
        Token type ids are all zero; the mask is 1 on real tokens.
        """
        max_len = self.config.max_seq_length
        tokens = self.tokenizer.tokenize(sequence)[:max_len - 2]
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        pad = [0] * (max_len - len(token_ids))
        input_ids_1 = token_ids[0:max_len] + pad
        token_type_ids_1 = [0] * max_len
        input_mask_1 = [1] * len(token_ids) + pad
        return input_ids_1, token_type_ids_1, input_mask_1
def test_compare(self):
    """Check that the stock and the Keras BERT produce the same activations.

    Mini BERT weights are generated into a temporary directory, one short
    sentence is encoded and fed to both implementations, and the outputs
    are required to agree within ``atol=1e-6``.
    """
    # Use the context manager so the directory is guaranteed to exist while
    # the test runs and is removed afterwards. The previous
    # ``TemporaryDirectory().name`` + ``os.makedirs`` relied on the temporary
    # object being finalized immediately and never cleaned up the directory.
    with tempfile.TemporaryDirectory() as model_dir:
        MiniBertFactory.create_mini_bert_weights(model_dir)

        tokenizer = FullTokenizer(vocab_file=os.path.join(
            model_dir, "vocab.txt"),
                                  do_lower_case=True)

        # prepare input
        max_seq_len = 16
        input_str = "hello, bert!"

        input_tokens = tokenizer.tokenize(input_str)
        input_tokens = ["[CLS]"] + input_tokens + ["[SEP]"]

        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        input_ids = input_ids + [0] * (max_seq_len - len(input_tokens))
        input_mask = [1] * len(input_tokens) + [0] * (max_seq_len -
                                                      len(input_tokens))
        token_type_ids = [0] * len(input_tokens) + [0] * (max_seq_len -
                                                          len(input_tokens))

        input_ids = np.array([input_ids], dtype=np.int32)
        input_mask = np.array([input_mask], dtype=np.int32)
        token_type_ids = np.array([token_type_ids], dtype=np.int32)

        print(" tokens:", input_tokens)
        print(
            "input_ids:{}/{}:{}".format(len(input_tokens), max_seq_len,
                                        input_ids), input_ids.shape,
            token_type_ids)

        bert_1_seq_out = CompareBertActivationsTest.predict_on_stock_model(
            model_dir, input_ids, input_mask, token_type_ids)
        bert_2_seq_out = CompareBertActivationsTest.predict_on_keras_model(
            model_dir, input_ids, input_mask, token_type_ids)

        np.set_printoptions(precision=9,
                            threshold=20,
                            linewidth=200,
                            sign="+",
                            floatmode="fixed")

        print("stock bert res", bert_1_seq_out.shape)
        print("keras bert res", bert_2_seq_out.shape)

        print("stock bert res:\n {}".format(bert_1_seq_out[0, :2, :10]),
              bert_1_seq_out.dtype)
        print("keras bert_res:\n {}".format(bert_2_seq_out[0, :2, :10]),
              bert_2_seq_out.dtype)

        abs_diff = np.abs(bert_1_seq_out - bert_2_seq_out).flatten()
        print("abs diff:", np.max(abs_diff), np.argmax(abs_diff))
        self.assertTrue(
            np.allclose(bert_1_seq_out, bert_2_seq_out, atol=1e-6))
def test_finetune(self):
    """Smoke-test that the Keras BERT model can be compiled and fitted.

    Mini BERT weights are generated into a temporary directory, a batch of
    two sentences is encoded, and the model is trained for two epochs
    against all-zero targets shaped like its own prediction.
    """
    # Context manager keeps the directory alive for the whole test and
    # removes it afterwards (``TemporaryDirectory().name`` + ``os.makedirs``
    # depended on immediate finalization and leaked the directory).
    with tempfile.TemporaryDirectory() as model_dir:
        MiniBertFactory.create_mini_bert_weights(model_dir)

        tokenizer = FullTokenizer(vocab_file=os.path.join(
            model_dir, "vocab.txt"),
                                  do_lower_case=True)

        # prepare input
        max_seq_len = 24
        input_str_batch = ["hello, bert!", "how are you doing!"]

        input_ids_batch = []
        token_type_ids_batch = []
        for input_str in input_str_batch:
            input_tokens = tokenizer.tokenize(input_str)
            input_tokens = ["[CLS]"] + input_tokens + ["[SEP]"]

            print("input_tokens len:", len(input_tokens))

            input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
            input_ids = input_ids + [0] * (max_seq_len - len(input_tokens))
            token_type_ids = [0] * len(input_tokens) + [0] * (
                max_seq_len - len(input_tokens))

            input_ids_batch.append(input_ids)
            token_type_ids_batch.append(token_type_ids)

        input_ids = np.array(input_ids_batch, dtype=np.int32)
        token_type_ids = np.array(token_type_ids_batch, dtype=np.int32)

        # input_tokens here is the token list of the last batch entry.
        print(" tokens:", input_tokens)
        print(
            "input_ids:{}/{}:{}".format(len(input_tokens), max_seq_len,
                                        input_ids), input_ids.shape,
            token_type_ids)

        model = CompareBertActivationsTest.load_keras_model(
            model_dir, max_seq_len)
        model.compile(optimizer=keras.optimizers.Adam(),
                      loss=keras.losses.mean_squared_error)

        # just for fetching the shape of the output
        pres = model.predict([input_ids, token_type_ids])
        print("pres:", pres.shape)

        model.fit(x=(input_ids, token_type_ids),
                  y=np.zeros_like(pres),
                  batch_size=2,
                  epochs=2)
def tokenize_document(doc_info: dict, tokenizer: "FullTokenizer") -> dict:
    """Tokenize a document into sub-tokens and re-index its annotations.

    :param doc_info: dict with the keys ``'sentences'`` (list of lists of
        word strings), ``'speakers'`` (iterable of ``(word_index, speaker)``
        pairs), ``'clusters'`` (list of clusters, each a list of
        ``(start_word, end_word)`` pairs) and ``'doc_key'``.
    :param tokenizer: object exposing ``tokenize(str) -> list``.
    :return: dict with ``'sub_tokens'``, ``'sentence_map'`` (sub-token ->
        sentence id), ``'subtoken_map'`` (sub-token -> word index),
        ``'speakers'`` (first sub-token index of the word -> tokenized
        speaker), ``'clusters'`` (mentions as ``(first sub-token of start
        word, last sub-token of end word)``) and ``'doc_key'``.
    :raises ValueError: if a speaker or mention refers to a word index that
        produced no sub-tokens (or does not exist).
    """
    sub_tokens: List[str] = []  # all sub tokens of a document
    sentence_map: List[int] = []  # collected tokenized tokens -> sentence id
    subtoken_map: List[int] = []  # collected tokenized tokens -> original token id

    # word index -> offset of its first / last sub-token. Recording these
    # while tokenizing replaces a linear list search (and a reversed copy of
    # the whole map) for every speaker and every mention.
    first_subtoken = {}
    last_subtoken = {}

    word_idx = -1
    for sentence_id, sentence in enumerate(doc_info['sentences']):
        for token in sentence:
            word_idx += 1
            word_tokens = tokenizer.tokenize(token)
            if word_tokens:
                first_subtoken[word_idx] = len(sub_tokens)
                last_subtoken[word_idx] = len(sub_tokens) + len(word_tokens) - 1
            sub_tokens.extend(word_tokens)
            sentence_map.extend([sentence_id] * len(word_tokens))
            subtoken_map.extend([word_idx] * len(word_tokens))

    def _locate(table, word_index):
        # Keep the ValueError that list.index() used to raise for unknown words.
        try:
            return table[word_index]
        except KeyError:
            raise ValueError(
                '%r is not in subtoken_map' % (word_index, )) from None

    speakers = {
        _locate(first_subtoken, word_index): tokenizer.tokenize(speaker)
        for word_index, speaker in doc_info['speakers']
    }
    clusters = [[(_locate(first_subtoken, start), _locate(last_subtoken, end))
                 for start, end in cluster]
                for cluster in doc_info['clusters']]

    tokenized_document = {
        'sub_tokens': sub_tokens,
        'sentence_map': sentence_map,
        'subtoken_map': subtoken_map,
        'speakers': speakers,
        'clusters': clusters,
        'doc_key': doc_info['doc_key']
    }
    return tokenized_document
class BERTTokenizer(BaseTokenizer):
    """Callable wrapper around ``bert.tokenization.FullTokenizer``."""

    def __init__(self, vocab_file=None, **kwargs):
        """
        :param vocab_file: path of the BERT vocabulary file (required).
        :param kwargs: accepted for signature compatibility; not used here.
        :raises ValueError: if ``vocab_file`` is ``None``.
        """
        # Initialise the base class as well, as the sibling BERTTokenizer
        # definition does.
        super().__init__()
        if vocab_file is None:
            raise ValueError(
                'Vocabulary file is required to initialize BERT tokenizer')
        # Imported lazily so the dependency is only needed when this
        # tokenizer is actually instantiated.
        from bert.tokenization import FullTokenizer
        self.tokenizer = FullTokenizer(vocab_file)

    def __call__(self, text):
        """Return the list of tokens produced for ``text``."""
        return self.tokenizer.tokenize(text)
def test_direct_keras_to_stock_compare(self):
    """Compare stock-BERT and Keras-BERT outputs for one short sentence.

    The sentence is encoded to a length-6 input, run through
    ``predict_on_stock_model`` and ``predict_on_keras_model``, and both
    results must agree within ``atol=1e-6``.
    """
    from tests.ext.modeling import BertModel, BertConfig, get_assignment_map_from_checkpoint

    bert_config = BertConfig.from_json_file(self.bert_config_file)
    vocab_path = os.path.join(self.bert_ckpt_dir, "vocab.txt")
    tokenizer = FullTokenizer(vocab_file=vocab_path)

    # prepare input
    max_seq_len = 6
    input_str = "Hello, Bert!"
    input_tokens = ["[CLS]"] + tokenizer.tokenize(input_str) + ["[SEP]"]
    n_tokens = len(input_tokens)
    n_pad = max_seq_len - n_tokens

    ids = tokenizer.convert_tokens_to_ids(input_tokens) + [0] * n_pad
    mask = [1] * n_tokens + [0] * n_pad
    type_ids = [0] * n_tokens + [0] * n_pad

    input_ids = np.array([ids], dtype=np.int32)
    input_mask = np.array([mask], dtype=np.int32)
    token_type_ids = np.array([type_ids], dtype=np.int32)

    print(" tokens:", input_tokens)
    summary = "input_ids:{}/{}:{}".format(n_tokens, max_seq_len, input_ids)
    print(summary, input_ids.shape, token_type_ids)

    s_res = self.predict_on_stock_model(input_ids, input_mask,
                                        token_type_ids)
    k_res = self.predict_on_keras_model(input_ids, input_mask,
                                        token_type_ids)

    np.set_printoptions(precision=9,
                        threshold=20,
                        linewidth=200,
                        sign="+",
                        floatmode="fixed")
    print("s_res", s_res.shape)
    print("k_res", k_res.shape)

    print("s_res:\n {}".format(s_res[0, :2, :10]), s_res.dtype)
    print("k_res:\n {}".format(k_res[0, :2, :10]), k_res.dtype)

    # Largest element-wise deviation and its flat position, for debugging.
    adiff = np.abs(s_res - k_res).flatten()
    print("diff:", np.max(adiff), np.argmax(adiff))

    outputs_match = np.allclose(s_res, k_res, atol=1e-6)
    self.assertTrue(outputs_match)
class BERTTokenizer(BaseTokenizer):
    """Callable BERT tokenizer that frames its output with ``[CLS]``/``[SEP]``."""

    def __init__(self, vocab_file=None, **kwargs):
        """
        :param vocab_file: path of the BERT vocabulary file (required).
        :param kwargs: accepted for signature compatibility; not used here.
        :raises ValueError: if ``vocab_file`` is ``None`` or the
            ``bert`` package cannot be imported.
        """
        super().__init__()
        if vocab_file is None:
            raise ValueError(
                'Vocabulary file is required to initialize BERT tokenizer')
        try:
            from bert.tokenization import FullTokenizer
        except ImportError as err:
            # Chain explicitly so the original import failure stays visible
            # as the direct cause of the ValueError.
            raise ValueError(
                "Please install bert-tensorflow: pip install bert-tensorflow"
            ) from err
        self.tokenizer = FullTokenizer(vocab_file)

    def __call__(self, text):
        """Return ``['[CLS]'] + tokens of text + ['[SEP]']``."""
        return ['[CLS]'] + self.tokenizer.tokenize(text) + ['[SEP]']
def tokenize_single_input(text, tokenizer: btk.FullTokenizer,
                          max_input_length):
    """Encode one text as fixed-length ids, mask and segment ids.

    The text is tokenized, prefixed with ``[CLS]`` and converted to ids;
    ids and mask are then zero-padded to ``max_input_length``.

    NOTE(review): no ``[SEP]`` token is appended here - confirm this is
    what the consuming model expects.

    :param text: raw input text.
    :param tokenizer: object exposing ``tokenize`` and
        ``convert_tokens_to_ids``.
    :param max_input_length: length of each returned list.
    :return: ``(token_ids, token_masks, segment_ids)``; the mask is 1 on
        real tokens and the segment ids are all zero.
    :raises ValueError: if the encoded text is longer than
        ``max_input_length``.
    """
    pieces = ['[CLS]'] + tokenizer.tokenize(text)
    token_ids = tokenizer.convert_tokens_to_ids(pieces)
    used = len(token_ids)

    if used > max_input_length:
        raise ValueError(
            'The input is %i while the maximum input can be only %i.' %
            (used, max_input_length))

    missing = max_input_length - used
    token_ids.extend([0] * missing)
    token_masks = [1] * used + [0] * missing
    segment_ids = [0] * max_input_length
    return token_ids, token_masks, segment_ids
def tokenize_data(input_str_batch, max_seq_len, model_dir):
    """Encode a batch of strings as fixed-length BERT id sequences.

    Each string is tokenized with the lower-casing ``FullTokenizer`` built
    from ``<model_dir>/vocab.txt`` and framed as ``[CLS] ... [SEP]``.
    Sequences that are too long are cut to ``max_seq_len`` while keeping
    ``[SEP]`` as the last token; shorter ones are zero-padded.

    :param input_str_batch: iterable of raw input strings.
    :param max_seq_len: length of every returned sequence.
    :param model_dir: directory containing ``vocab.txt``.
    :return: ``(input_ids_batch, token_type_ids_batch)``; the token type
        ids are all zero.
    """
    tokenizer = FullTokenizer(vocab_file=os.path.join(model_dir, "vocab.txt"),
                              do_lower_case=True)
    input_ids_batch = []
    token_type_ids_batch = []

    for input_str in input_str_batch:
        input_tokens = ["[CLS]"] + tokenizer.tokenize(input_str) + ["[SEP]"]
        # Reports the untruncated length, specials included.
        print("input_tokens len:", len(input_tokens))

        if len(input_tokens) > max_seq_len:
            input_tokens = input_tokens[:max_seq_len]
            if max_seq_len >= 2:
                # Plain truncation used to cut off the closing [SEP];
                # restore it in the last slot.
                input_tokens[-1] = "[SEP]"

        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        input_ids = input_ids + [0] * (max_seq_len - len(input_tokens))
        token_type_ids = [0] * max_seq_len

        input_ids_batch.append(input_ids)
        token_type_ids_batch.append(token_type_ids)

    return input_ids_batch, token_type_ids_batch
class DisasterDetector:
    """Binary text classifier built on top of a BERT Keras layer."""

    def __init__(self, bert_layer, max_sql, lr, batch_size, epochs):
        """
        :param bert_layer: BERT layer; its ``resolved_object`` supplies the
            vocabulary file and lower-casing flag for the tokenizer.
        :param max_sql: fixed sequence length of the model inputs.
        :param lr: learning rate passed to ``Adam``.
        :param batch_size: batch size used in ``fit``.
        :param epochs: number of epochs used in ``fit``.
        """
        self.bert_layer = bert_layer
        self.max_sql = max_sql

        resolved = self.bert_layer.resolved_object
        vocab = resolved.vocab_file.asset_path.numpy()
        lowercase = resolved.do_lower_case.numpy()
        self.token = FullTokenizer(vocab, lowercase)

        self.lr = lr
        self.batch_size = batch_size
        self.epochs = epochs
        self.models = []
        self.scores = {}

    def encode(self, texts):
        """Encode texts as ``(token ids, masks, segment ids)`` arrays.

        Every text is tokenized, clipped to ``max_sql - 2`` tokens, framed
        with ``[CLS]``/``[SEP]`` and zero-padded to ``max_sql``. The mask is
        1 on real tokens; segment ids are all zero.
        """
        id_rows, mask_rows, segment_rows = [], [], []
        for text in texts:
            pieces = self.token.tokenize(text)[:self.max_sql - 2]
            framed = ['[CLS]'] + pieces + ['[SEP]']
            n_pad = self.max_sql - len(framed)

            ids = self.token.convert_tokens_to_ids(framed)
            ids += [0] * n_pad

            id_rows.append(ids)
            mask_rows.append([1] * len(framed) + [0] * n_pad)
            segment_rows.append([0] * self.max_sql)

        return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)

    def build_model(self):
        """Build and compile the classifier.

        The second output of the BERT layer is sliced at position 0 and fed
        into a single sigmoid unit; the model is compiled with binary
        cross-entropy, ``Adam`` and an accuracy metric.
        """
        shape = (self.max_sql, )
        input_words = Input(shape=shape, dtype=tf.int32, name='input_words')
        input_mask = Input(shape=shape, dtype=tf.int32, name='input_mask')
        segmentids = Input(shape=shape, dtype=tf.int32, name='segment_ids')
        model_inputs = [input_words, input_mask, segmentids]

        # without pooled output
        _, sequence_output = self.bert_layer(model_inputs)
        clf_output = sequence_output[:, 0, :]
        out = Dense(1, activation='sigmoid')(clf_output)

        model = Model(inputs=model_inputs, outputs=out)
        model.compile(loss='binary_crossentropy',
                      optimizer=Adam(learning_rate=self.lr),
                      metrics=['accuracy'])
        return model

    def fit(self, x):
        """Train on an 80/20 split of ``x`` and checkpoint the best model.

        ``x`` must provide the columns ``cleaned`` (text) and
        ``target_relabeled`` (labels). The weights with the best validation
        loss are written to ``model_BERT.h5``.
        """
        xtrain, xval, _, _ = train_test_split(x,
                                              x.target_relabeled,
                                              test_size=0.2,
                                              random_state=878)

        # Labels are taken from the split frames themselves.
        ytrain = xtrain.target_relabeled
        xtrain = self.encode(xtrain.cleaned.str.lower())
        yval = xval.target_relabeled
        xval = self.encode(xval.cleaned.str.lower())

        metrics = ClassificationReport(train=(xtrain, ytrain),
                                       val=(xval, yval))
        checkpoint = ModelCheckpoint('model_BERT.h5',
                                     monitor='val_loss',
                                     save_best_only=True)

        model = self.build_model()
        model.fit(xtrain,
                  ytrain,
                  validation_data=(xval, yval),
                  callbacks=[metrics, checkpoint],
                  epochs=self.epochs,
                  batch_size=self.batch_size)

    def predict(self, x):
        """Rebuild the model, load ``model_BERT.h5`` and predict on ``x.cleaned``."""
        model = self.build_model()
        model.load_weights('model_BERT.h5')
        encoded = self.encode(x.cleaned.str.lower())
        return model.predict(encoded)
class BertPreprocessor(Preprocessor):
    """Preprocessor for BERT embedding.

    This class can be used to do all the work to create the inputs (and outputs) of a Neural Network using BERT
    as embedding. Single sequences and sequence pairs are encoded.

    Source: https://github.com/google-research/bert_keras
    """

    def __init__(self, pretrained_model_path: str, **kwargs):
        """Load the tokenizer settings of the pretrained TF-Hub module.

        :param pretrained_model_path: spec/path passed to ``hub.Module``.
        :param kwargs: forwarded to the base ``Preprocessor``.
        """
        super().__init__(**kwargs)

        info = hub.Module(spec=pretrained_model_path)(
            signature="tokenization_info", as_dict=True)

        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run(
                [info["vocab_file"], info["do_lower_case"]])

        # Create the tokenizer with the vocabulary of the pretrained model
        self._tokenizer = FullTokenizer(vocab_file=vocab_file,
                                        do_lower_case=do_lower_case)

        basic_tokens = self._tokenizer.convert_tokens_to_ids(
            ["[CLS]", "[SEP]"])
        self._CLS_token = basic_tokens[0]
        self._SEP_token = basic_tokens[1]

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length."""
        # This is a simple heuristic which will always truncate the longer sequence
        # one token at a time. This makes more sense than truncating an equal percent
        # of tokens from each, since if one sequence is very short then each token
        # that's truncated likely contains more information than a longer sequence.
        while len(tokens_a) + len(tokens_b) > max_length:
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def _padding_sentence(self):
        """Return a zero length sentence to pad last batch.

        :return: Three sequences of zeros (tokens, masks, segment ids).
        """
        return [0] * self._max_seq_len, [0] * self._max_seq_len, [
            0
        ] * self._max_seq_len

    def tokenize(self, text_a: str, text_b: str = None):
        """Convert sequence(s) of words into sequence(s) of tokens and also compute the masking- and segment ids.

        A single sequence is laid out as ``[CLS] a [SEP]``; a pair as
        ``[CLS] a [SEP] b [SEP]``. All three results are zero-padded to the
        maximum sequence length.

        :param text_a: First sequence
        :param text_b: Second sequence
        :return: The sequence of tokens, masks and segment ids.
        """
        max_len = self._max_seq_len

        tokens_a = self._tokenizer.tokenize(text_a)
        tokens_b = self._tokenizer.tokenize(text_b) if text_b else None

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            self._truncate_seq_pair(tokens_a, tokens_b, max_len - 3)
        elif len(tokens_a) > max_len - 2:
            # Account for [CLS] and [SEP] with "- 2"
            tokens_a = tokens_a[0:(max_len - 2)]

        # First segment: [CLS] text_a [SEP], segment id 0.
        input_ids = [self._CLS_token]
        input_ids += list(self._tokenizer.convert_tokens_to_ids(tokens_a))
        input_ids.append(self._SEP_token)
        input_segment_ids = [0] * len(input_ids)

        if tokens_b:
            # Second segment: text_b [SEP], segment id 1. The separator
            # between the two segments used to be missing although the
            # truncation above already reserved room for it.
            ids_b = list(self._tokenizer.convert_tokens_to_ids(tokens_b))
            ids_b.append(self._SEP_token)
            input_ids += ids_b
            input_segment_ids += [1] * len(ids_b)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        input_mask = [1] * len(input_ids)

        pad_len = max_len - len(input_ids)
        input_ids += [0] * pad_len
        input_mask += [0] * pad_len
        input_segment_ids += [0] * pad_len

        # safety check
        assert len(input_ids) == self._max_seq_len
        assert len(input_mask) == self._max_seq_len
        assert len(input_segment_ids) == self._max_seq_len

        return input_ids, input_mask, input_segment_ids

    def fit(self, texts: List[str]) -> 'BertPreprocessor':
        """This function does nothing in case of BERT but must be implemented.

        :param texts: -
        :return: self
        """
        return self

    def transform(self, examples: List[InputExample]) -> list:
        """Transform examples into arrays of tokens, masks and segment ids.

        Masks are used to separate valid and padding tokens. Segment ids are
        0 for the first text and 1 for the optional second text.

        :param examples: Examples providing ``text_a`` and ``text_b``.
        :return: ``[input_ids, input_masks, segment_ids]`` as arrays.
        """
        input_ids, input_masks, segment_ids = [], [], []

        for example in examples:
            input_id, input_mask, segment_id = self.tokenize(
                text_a=example.text_a, text_b=example.text_b)
            input_ids.append(input_id)
            input_masks.append(input_mask)
            segment_ids.append(segment_id)

        return [
            np.array(input_ids),
            np.array(input_masks),
            np.array(segment_ids)
        ]

    def inverse_transform(self, sequences: np.ndarray):
        """Map token ids back to their token strings.

        :param sequences: The sequence of token ids.
        :return: The corresponding tokens.
        """
        return self._tokenizer.convert_ids_to_tokens(sequences)
class BertNer(object):
    """Character-level NER predictor backed by a frozen BERT graph."""

    def __init__(self, **kwargs):
        """
        Expected keyword arguments: ``gpu_no`` and ``verbose`` (passed to
        ``import_tf``), ``log_dir`` (passed to ``set_logger``) and
        ``ner_model`` (directory holding ``vocab.txt``, ``ner_model.pb``
        and ``label2id.pkl``).
        """
        self.tf = import_tf(kwargs['gpu_no'], kwargs['verbose'])
        self.logger = set_logger('BertNer', kwargs['log_dir'],
                                 kwargs['verbose'])
        self.model_dir = kwargs['ner_model']

        from bert.tokenization import FullTokenizer
        self.tokenizer = FullTokenizer(
            os.path.join(self.model_dir, 'vocab.txt'))

        self.ner_sq_len = 128
        self.input_ids = self.tf.placeholder(self.tf.int32,
                                             (None, self.ner_sq_len),
                                             'input_ids')
        self.input_mask = self.tf.placeholder(self.tf.int32,
                                              (None, self.ner_sq_len),
                                              'input_mask')

        # init graph
        self._init_graph()

        # init ner assist data
        self._init_predict_var()

        self.per_proun = [
            '甲', '乙', '丙', '丁', '戊', '己', '庚', '辛', '壬', '癸', '子', '丑', '寅',
            '卯', '辰', '巳', '午', '未', '申', '酉', '戌', '亥'
        ]

    def _init_graph(self):
        """
        init bert ner graph

        Failures are only logged; in that case ``self.sess`` and
        ``self.pred_ids`` may be left unset.
        :return:
        """
        try:
            with self.tf.gfile.GFile(
                    os.path.join(self.model_dir, 'ner_model.pb'), 'rb') as f:
                graph_def = self.tf.GraphDef()
                graph_def.ParseFromString(f.read())
                input_map = {
                    "input_ids:0": self.input_ids,
                    'input_mask:0': self.input_mask
                }
                self.pred_ids = self.tf.import_graph_def(
                    graph_def,
                    name='',
                    input_map=input_map,
                    return_elements=['pred_ids:0'])[0]
                graph = self.pred_ids.graph

                sess_config = self.tf.ConfigProto(allow_soft_placement=True)
                sess_config.gpu_options.allow_growth = True

                self.sess = self.tf.Session(graph=graph, config=sess_config)
                self.sess.run(self.tf.global_variables_initializer())
                self.tf.reset_default_graph()
        except Exception as e:
            self.logger.error(e)

    def _init_predict_var(self):
        """
        initialize assist of bert ner: builds ``self.id2label`` by inverting
        the label -> id mapping stored in ``label2id.pkl``
        """
        # NOTE(security): pickle.load executes arbitrary code embedded in the
        # file. The model directory must come from a trusted source.
        with open(os.path.join(self.model_dir, 'label2id.pkl'), 'rb') as rf:
            self.id2label = {
                value: key
                for key, value in pickle.load(rf).items()
            }

    def _convert_lst_to_features(self,
                                 lst_str,
                                 is_tokenized=True,
                                 mask_cls_sep=False):
        """
        Loads a data file into a list of `InputBatch`s.

        :param lst_str: list str
        :param is_tokenized: whether token unknown word
        :param mask_cls_sep: masking the embedding on [CLS] and [SEP] with zero.
        :return: generator of input feature instances
        """
        from bert.extract_features import read_tokenized_examples, read_examples, InputFeatures

        examples = read_tokenized_examples(
            lst_str) if is_tokenized else read_examples(lst_str)

        def _tokenize(x):
            if is_tokenized:
                return self.tokenizer.mark_unk_tokens(x)
            return self.tokenizer.tokenize(x)

        # Mask value used on the [CLS] / [SEP] positions.
        special_mask = int(not mask_cls_sep)

        for example in examples:
            tokens_a = _tokenize(example.text_a)

            tokens_b = None
            if example.text_b:
                tokens_b = _tokenize(example.text_b)

            if tokens_b:
                # Modifies `tokens_a` and `tokens_b` in place so that the total
                # length is less than the specified length.
                # Account for [CLS], [SEP], [SEP] with "- 3"
                self._truncate_seq_pair(tokens_a, tokens_b)
            else:
                # Account for [CLS] and [SEP] with "- 2"
                if len(tokens_a) > self.ner_sq_len - 2:
                    tokens_a = tokens_a[0:(self.ner_sq_len - 2)]

            # The convention in BERT is:
            # (a) For sequence pairs:
            #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
            #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
            # (b) For single sequences:
            #  tokens:   [CLS] the dog is hairy . [SEP]
            #  type_ids: 0     0   0   0  0     0 0
            tokens = ['[CLS]'] + tokens_a + ['[SEP]']
            input_type_ids = [0] * len(tokens)
            input_mask = [special_mask] + [1] * len(tokens_a) + [special_mask]

            if tokens_b:
                tokens += tokens_b + ['[SEP]']
                input_type_ids += [1] * (len(tokens_b) + 1)
                input_mask += [1] * len(tokens_b) + [special_mask]

            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)

            # Zero-pad up to the sequence length.
            pad_len = self.ner_sq_len - len(input_ids)
            input_ids += [0] * pad_len
            input_mask += [0] * pad_len
            input_type_ids += [0] * pad_len

            assert len(input_ids) == self.ner_sq_len
            assert len(input_mask) == self.ner_sq_len
            assert len(input_type_ids) == self.ner_sq_len

            yield InputFeatures(input_ids=input_ids,
                                input_mask=input_mask,
                                input_type_ids=input_type_ids)

    def _truncate_seq_pair(self, tokens_a, tokens_b):
        """
        Truncates a sequence pair in place to the maximum length
        (``ner_sq_len - 3``), always shortening the longer sequence.

        :param tokens_a: text a
        :param tokens_b: text b
        """
        try:
            while len(tokens_a) + len(tokens_b) > self.ner_sq_len - 3:
                if len(tokens_a) > len(tokens_b):
                    tokens_a.pop()
                else:
                    tokens_b.pop()
        except Exception as e:
            # Log the actual error; logger.error() without a message would
            # itself raise a TypeError.
            self.logger.error(e)

    def _convert_id_to_label(self, pred_ids_result, batch_size):
        """
        turn id to label

        Ids are read until the first 0. ``[CLS]``/``[SEP]`` labels are
        skipped; a separator with id 102 that is directly followed by a 0
        ends the row.

        :param pred_ids_result: predict result
        :param batch_size: batch size of predict ids result
        :return: (label lists, id lists)
        """
        result = []
        index_result = []

        for row in range(batch_size):
            curr_seq = []
            curr_idx = []
            ids = pred_ids_result[row]
            for idx, label_id in enumerate(ids):
                if label_id == 0:
                    break
                curr_label = self.id2label[label_id]
                if curr_label in ['[CLS]', '[SEP]']:
                    # Check the bound of idx + 1, not idx, before peeking at
                    # the next id; otherwise the last position overflows.
                    if (label_id == 102 and idx + 1 < len(ids)
                            and ids[idx + 1] == 0):
                        break
                    continue
                curr_seq.append(curr_label)
                curr_idx.append(label_id)
            result.append(curr_seq)
            index_result.append(curr_idx)
        return result, index_result

    def _split_contents(self, contents):
        """Cut every content string into chunks of ``ner_sq_len - 2`` characters.

        :return: ``(chunks, terms)`` where ``terms[i]`` is the number of
            chunks produced for ``contents[i]`` (at least one, so an empty
            content still yields one empty chunk).
        """
        chunk_len = self.ner_sq_len - 2
        chunks = []
        all_terms = []
        for content in contents:
            # Ceiling division by the chunk length. The previous test
            # `len % ner_sq_len - 2 == 0` bound the modulo to ner_sq_len
            # instead of (ner_sq_len - 2), which could drop the tail of a
            # content or add a spurious empty chunk.
            terms = max(1, -(-len(content) // chunk_len))
            all_terms.append(terms)
            chunks.extend(content[i * chunk_len:(i + 1) * chunk_len]
                          for i in range(terms))
        return chunks, all_terms

    def predict(self, contents):
        """
        bert ner predict

        :param contents: content list
        :return: one entity list per content; ``[[]]`` if anything fails
        """
        try:
            splited_contents, all_terms = self._split_contents(contents)

            tmp_f = list(self._convert_lst_to_features(splited_contents))
            input_ids = [f.input_ids for f in tmp_f]
            input_masks = [f.input_mask for f in tmp_f]

            pred_result = self.sess.run(self.pred_ids,
                                        feed_dict={
                                            self.input_ids: input_ids,
                                            self.input_mask: input_masks
                                        })

            # restore to original string: concatenate the predictions of all
            # chunks that belong to the same content
            merged = []
            index = 0
            for terms in all_terms:
                sub_preds = []
                for i in range(terms):
                    sub_preds.extend(pred_result[index + i])
                merged.append(sub_preds)
                index += terms

            pred_result = self._convert_id_to_label(merged, len(merged))[0]

            # zip str predict id: pair every character with its label
            str_pred = [[[char, label]
                         for char, label in zip(list(content), labels)]
                        for content, labels in zip(contents, pred_result)]

            # get ner
            ner_result = [self._combine_ner(s) for s in str_pred]
            return ner_result
        except Exception as e:
            self.logger.error(e)
            return [[]]

    def _combine_ner(self, pred_result):
        """
        combine ner

        Consecutive ``B``/``I`` labelled characters are merged into one
        entity whose type is the label text after its two-character prefix.
        Characters listed in ``per_proun`` that are labelled ``O`` are
        reported as ``PER`` entities.

        :param pred_result: list of ``[character, label]`` pairs
        :return: list of ``[entity text, entity type]``
        """
        words_len = len(pred_result)
        i = 0
        tmp = ''
        _ner_list = []
        while i < words_len:
            word = pred_result[i]
            # add personal pronoun
            if word[0] in self.per_proun and word[1][0] == 'O':
                _ner_list.append([word[0], 'PER'])
            # Compare strings by value; `is not ''` tests identity.
            if word[1][0] == 'O' and tmp != '':
                _ner_list.append([tmp, pred_result[i - 1][1][2:]])
                tmp = ''
            elif word[1][0] == 'I':
                tmp = tmp + word[0]
                if i == words_len - 1:
                    _ner_list.append([tmp, word[1][2:]])
            elif word[1][0] == 'B':
                if tmp != '':
                    _ner_list.append([tmp, pred_result[i - 1][1][2:]])
                tmp = word[0]
                if i == words_len - 1:
                    _ner_list.append([tmp, word[1][2:]])
            i += 1
        return _ner_list
def progSuccess():
    """Classify the submitted text and render the result page.

    Reads ``progTextField`` from the request form, embeds the first 512
    characters with BERT, scores the pooled embedding with the saved
    classifier and renders ``progressionSuccess.html`` with either
    ``"depression"`` (score in (0, 0.5]) or ``"suicidal"`` (score in
    (0.5, 1]) plus the score as a percentage.

    NOTE(review): both the classifier and the BERT layer are loaded on every
    call; consider loading them once at start-up.
    NOTE(review): when the score falls outside (0, 1] the function only
    calls ``reloadWebsite()`` and returns ``None`` - confirm that this is
    the intended fallback.
    """
    text = request.form['progTextField']
    s = text[0:512]

    final_model = keras.models.load_model(
        '/home/suiSense/my_site/final_regular_model.h5')

    max_seq_length = 512  # Your choice here.
    input_word_ids = tf.keras.layers.Input(shape=(max_seq_length, ),
                                           dtype=tf.int32,
                                           name="input_word_ids")
    input_mask = tf.keras.layers.Input(shape=(max_seq_length, ),
                                       dtype=tf.int32,
                                       name="input_mask")
    segment_ids = tf.keras.layers.Input(shape=(max_seq_length, ),
                                        dtype=tf.int32,
                                        name="segment_ids")
    bert_layer = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
        trainable=True)
    pooled_output, sequence_output = bert_layer(
        [input_word_ids, input_mask, segment_ids])

    model = Model(inputs=[input_word_ids, input_mask, segment_ids],
                  outputs=[pooled_output, sequence_output])

    # See BERT paper: https://arxiv.org/pdf/1810.04805.pdf
    # And BERT implementation convert_single_example() at https://github.com/google-research/bert/blob/master/run_classifier.py
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = FullTokenizer(vocab_file, do_lower_case)

    # 512 characters can still produce more than 510 tokens; clip so that
    # [CLS] + tokens + [SEP] never exceeds the model's input length.
    stokens = tokenizer.tokenize(s)[:max_seq_length - 2]
    stokens = ["[CLS]"] + stokens + ["[SEP]"]

    input_ids = get_ids(stokens, tokenizer, max_seq_length)
    input_masks = get_masks(stokens, max_seq_length)
    input_segments = get_segments(stokens, max_seq_length)

    pool_embs, _ = model.predict([[input_ids], [input_masks],
                                  [input_segments]])
    predictions = final_model.predict(pool_embs)

    # Compare the scalar score rather than the whole prediction array.
    score = predictions[0][0]
    predictionPercentage = score * 100

    if 0.0 < score <= 0.500:
        return render_template("progressionSuccess.html",
                               contents="depression",
                               intvar=predictionPercentage)
    elif 0.500 < score <= 1.000:
        return render_template("progressionSuccess.html",
                               contents="suicidal",
                               intvar=predictionPercentage)

    try:
        reloadWebsite()
    except Exception:
        print('reload failed')
from bert.tokenization import FullTokenizer
from sacremoses import MosesTokenizer, MosesDetokenizer
from sequence_transfer.sequence import CharSequence, TokenSequence
from sequence_transfer.plugin.entity_annotation_transfer_plugin import EntityAnnotationTransferPlugin, \
    EntityAnnotationSequence
from sequence_transfer.magic_transfer import MagicTransfer

# Scratch script: tokenizes one French sentence with the BERT and the Moses
# tokenizers, prints the results and then exits.

# We create a char sequence
text = " J'adore Zoé! "
char_sequence = CharSequence.new(text)

# We create the token sequence
# First with the BERT tokenizer (expects 'vocab.txt' in the working directory)
tokenizer = FullTokenizer('vocab.txt')
tokens = tokenizer.tokenize(text)
print(tokens)

# Then with the French Moses tokenizer; `tokenizer` and `tokens` are rebound
tokenizer = MosesTokenizer('fr')
tokens = tokenizer.tokenize(text)
print(tokens)

# Round-trip the Moses tokens back into text
detokenizer = MosesDetokenizer('fr')
y = detokenizer.detokenize(tokens)
print(y)

# The script stops here; everything below is never executed.
exit()
token_sequence = TokenSequence.new(tokens)

# We create a magic transfer
transfer = MagicTransfer(char_sequence, token_sequence)
class BERTEmbeddingEvaluator(SentenceEmbeddingEvaluator):
    """Sentence-embedding evaluator that runs a restored BERT graph in a TF session."""

    def __init__(
            self,
            model_fname="/notebooks/embedding/data/sentence-embeddings/bert/tune-ckpt",
            bertconfig_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/bert_config.json",
            vocab_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/vocab.txt",
            max_seq_length=32,
            dimension=768,
            num_labels=2,
            use_notebook=False):
        """Build the BERT graph and restore the latest checkpoint found under *model_fname*."""
        super().__init__("bert", dimension, use_notebook)
        bert_config = BertConfig.from_json_file(bertconfig_fname)
        self.max_seq_length = max_seq_length
        self.tokenizer = FullTokenizer(vocab_file=vocab_fname,
                                       do_lower_case=False)

        graph_parts = make_bert_graph(bert_config,
                                      max_seq_length,
                                      1.0,
                                      num_labels,
                                      tune=False)
        (self.model, self.input_ids, self.input_mask, self.segment_ids,
         self.probs) = graph_parts

        # The saver must be created after the graph so it sees every variable.
        restorer = tf.train.Saver(tf.global_variables())
        self.sess = tf.Session()
        restorer.restore(self.sess, tf.train.latest_checkpoint(model_fname))

    def predict(self, sentence):
        """Return the values of the graph's ``probs`` tensor for *sentence*."""
        feed = self.make_input(self.tokenize(sentence))
        return self.sess.run(self.probs, feed)

    def get_token_vector_sequence(self, sentence):
        """Return the tokenization of *sentence* and its token vector sequence.

        The result is ``[tokens, vectors]`` where ``vectors`` is the first
        ``len(tokens) + 2`` rows of the first batch item's sequence output.
        """
        pieces = self.tokenize(sentence)
        feed = self.make_input(pieces)
        first_item_output = self.model.get_sequence_output()[0]
        vectors = self.sess.run(first_item_output, feed)
        return [pieces, vectors[:len(pieces) + 2]]

    def get_sentence_vector(self, sentence):
        """Return the tokenization of *sentence* and the pooled output of the first batch item."""
        pieces = self.tokenize(sentence)
        feed = self.make_input(pieces)
        pooled = self.sess.run(self.model.pooled_output, feed)
        return [pieces, pooled[0]]

    def get_self_attention_score(self, sentence):
        """Return the tokenization of *sentence* and a self-attention score matrix.

        The matrix is taken from the last attention layer of the first batch
        item, summed over attention heads and cut to
        ``len(tokens) x len(tokens)``.
        """
        pieces = self.tokenize(sentence)
        feed = self.make_input(pieces)
        # raw_score : shape=[# of layers, batch_size, num_attention_heads, max_seq_length, max_seq_length]
        raw_score = self.sess.run(
            self.model.attn_probs_for_visualization_list, feed)
        # Take the last layer, then sum over the attention-head axis (axis=0).
        head_sum = np.sum(raw_score[-1][0], axis=0)
        # Keep as many rows/columns as there are tokens.
        # NOTE(review): the model input starts with [CLS], so row/column 0
        # presumably belongs to [CLS] rather than tokens[0] — confirm whether
        # an offset of one is wanted here.
        count = len(pieces)
        return [pieces, head_sum[:count, :count]]

    def tokenize(self, sentence):
        """Word-piece tokenize *sentence* after converting it to unicode."""
        unicode_sentence = convert_to_unicode(sentence)
        return self.tokenizer.tokenize(unicode_sentence)

    def make_input(self, tokens):
        """Build the feed dict (ids, segment ids, mask) for one sentence.

        *tokens* is cut to ``max_seq_length - 2`` entries, wrapped in
        ``[CLS]``/``[SEP]`` and zero-padded to ``max_seq_length``.
        """
        body = tokens[:(self.max_seq_length - 2)]
        wrapped = ["[CLS]"] + body + ["[SEP]"]
        segment = [0] * len(wrapped)
        ids = self.tokenizer.convert_tokens_to_ids(wrapped)
        used = len(ids)
        padding = [0] * (self.max_seq_length - used)
        return {
            self.input_ids: np.array([ids + padding]),
            self.segment_ids: np.array([segment + padding]),
            self.input_mask: np.array([[1] * used + padding]),
        }

    def visualize_self_attention_scores(self, sentence):
        """Plot the self-attention scores of *sentence* with the module-level helper."""
        pieces, score_matrix = self.get_self_attention_score(sentence)
        visualize_self_attention_scores(pieces,
                                        score_matrix,
                                        use_notebook=self.use_notebook)
def advancedProgSuccess():
    """Compare baseline and recent texts and render the progression result.

    The three ``baseline*`` and three ``recent*`` form fields are joined into
    one baseline text and one recent text. Both are embedded with the TF-Hub
    BERT encoder and scored by the baseline model (``fx*``) and by the final
    model (``gx*``). The scores are combined into a signed percentage that is
    rounded to three significant digits.

    Returns:
        The rendered ``advancedProgressionSuccess.html`` page with the signed
        percentage, its absolute value and the four raw scores.
    """
    # bringing in the text
    baseline_text = request.form['baselineOne'] + ' ' + request.form[
        'baselineTwo'] + ' ' + request.form['baselineThree']
    final_text = request.form['recentOne'] + ' ' + request.form[
        'recentTwo'] + ' ' + request.form['recentThree']

    # text truncation for bert
    baseline_text = baseline_text[0:512]
    final_text = final_text[0:512]

    # initializing models
    final_model = keras.models.load_model(
        '/home/suiSense/my_site/final_regular_model.h5')
    baseline_model = keras.models.load_model(
        '/home/suiSense/my_site/baseline_model.h5')

    # bringing in the bert model to apply for all the text
    max_seq_length = 512
    input_word_ids = tf.keras.layers.Input(shape=(max_seq_length, ),
                                           dtype=tf.int32,
                                           name="input_word_ids")
    input_mask = tf.keras.layers.Input(shape=(max_seq_length, ),
                                       dtype=tf.int32,
                                       name="input_mask")
    segment_ids = tf.keras.layers.Input(shape=(max_seq_length, ),
                                        dtype=tf.int32,
                                        name="segment_ids")
    bert_layer = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
        trainable=True)
    pooled_output, sequence_output = bert_layer(
        [input_word_ids, input_mask, segment_ids])
    model = Model(inputs=[input_word_ids, input_mask, segment_ids],
                  outputs=[pooled_output, sequence_output])
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = FullTokenizer(vocab_file, do_lower_case)

    def pooled_embedding(text):
        """Return the pooled BERT embedding of *text*."""
        # The character cut above does not bound the number of word pieces,
        # so truncate at token level too, leaving room for [CLS] and [SEP].
        stokens = tokenizer.tokenize(text)[:max_seq_length - 2]
        stokens = ["[CLS]"] + stokens + ["[SEP]"]
        input_ids = get_ids(stokens, tokenizer, max_seq_length)
        input_masks = get_masks(stokens, max_seq_length)
        input_segments = get_segments(stokens, max_seq_length)
        pool_embs, _ = model.predict([[input_ids], [input_masks],
                                      [input_segments]])
        return pool_embs

    # Each text is embedded once and the embedding is shared by both models.
    baseline_embs = pooled_embedding(baseline_text)
    final_embs = pooled_embedding(final_text)

    # baseline model on baseline text / recent text
    fxOne = baseline_model.predict(baseline_embs)[0][0]
    fxTwo = baseline_model.predict(final_embs)[0][0]
    # final model on baseline text / recent text
    gxOne = final_model.predict(baseline_embs)[0][0]
    gxTwo = final_model.predict(final_embs)[0][0]

    if fxTwo > 0.5:
        if fxOne > 0.5:
            predictionPercentage = (gxTwo - gxOne) * 100
        else:
            predictionPercentage = ((gxTwo + 1) - fxOne) * 100
    else:
        if fxOne > 0.5:
            predictionPercentage = (fxTwo - (gxOne + 1)) * 100
        else:
            predictionPercentage = (fxTwo - fxOne) * 100

    # Round to three significant digits. log10 is undefined for 0 (identical
    # scores) and for non-finite values, so those are left untouched.
    significant_digits = 3
    if predictionPercentage != 0 and math.isfinite(predictionPercentage):
        predictionPercentage = round(
            predictionPercentage, significant_digits -
            int(math.floor(math.log10(abs(predictionPercentage)))) - 1)
    absPredictionPercentage = abs(predictionPercentage)

    return render_template("advancedProgressionSuccess.html",
                           intvar=predictionPercentage,
                           absintvar=absPredictionPercentage,
                           fxOne=fxOne,
                           fxTwo=fxTwo,
                           gxOne=gxOne,
                           gxTwo=gxTwo)

    # NOTE(review): unreachable after the unconditional return above; kept so
    # the reload hook is not silently lost — confirm whether it is still wanted.
    try:
        reloadWebsite()
    except Exception:
        print('reload failed')
class TrainDataReader():
    """Converts a category's raw training file into id rows and TF input batches.

    The raw file ``<category_dir>/train_data/raw.csv`` is tab-separated with
    exactly 10 fields per usable row; field 7 is the text to tokenize and
    fields 8 and 9 together form the label key.
    """

    def __init__(self, config, category_dir, vocab_file):
        """Validate the input files and create the tokenizer.

        Args:
            config: object exposing ``max_seq_length`` and ``batch_size``.
            category_dir: directory holding ``train_data/raw.csv``.
            vocab_file: path of the word-piece vocabulary file.

        Raises:
            Exception: if the raw training file or the vocabulary file is missing.
        """
        # Validate the paths before anything tries to read them, so a missing
        # vocabulary is reported by the explicit message below.
        if not os.path.exists(
                os.path.join(category_dir, 'train_data', 'raw.csv')):
            raise Exception("local raw train data not exists!!")
        if not os.path.exists(vocab_file):
            raise Exception("local vocab_file not exists")

        self.config = config
        self.category_dir = category_dir
        self.tokenizer = FullTokenizer(vocab_file)

    @staticmethod
    def _parse_row(row):
        """Return the 10 tab-separated fields of *row*, or None for blank/malformed rows."""
        stripped = row.strip()
        if stripped == '':
            return None
        segment = stripped.split('\t')
        return segment if len(segment) == 10 else None

    def transform(self):
        """Write ``attr_values.pkl`` and ``train_data/transform.csv``.

        The pickle holds ``(label_key -> id, id -> label_key)`` where ids
        follow first appearance in the raw file. Each line of
        ``transform.csv`` is ``label,length,id_1,...,id_max_seq_length`` with
        the ids zero-padded.

        Returns:
            The number of distinct label keys.
        """
        raw_path = os.path.join(self.category_dir, 'train_data', 'raw.csv')
        max_len = self.config.max_seq_length

        # Pass 1: collect the label vocabulary.
        attr_values = {}
        with open(raw_path) as fr, \
                open(os.path.join(self.category_dir, 'attr_values.pkl'), 'wb') as fwa:
            for row in fr:
                segment = self._parse_row(row)
                if segment is None:
                    continue
                # The id is the insertion rank of a key seen for the first time.
                attr_values.setdefault((segment[8], segment[9]),
                                       len(attr_values))
            attr_values_r = {i: k for k, i in attr_values.items()}
            print('start to write local attr_values.pkl!!')
            pickle.dump((attr_values, attr_values_r), fwa)

        # Pass 2: tokenize every row and emit the fixed-length id rows.
        with open(raw_path) as fr, \
                open(os.path.join(self.category_dir, 'train_data', 'transform.csv'), 'w') as fwt:
            print('start to write local train_data transform.csv!!')
            for row in fr:
                segment = self._parse_row(row)
                if segment is None:
                    continue
                label = attr_values[(segment[8], segment[9])]
                tokens = self.tokenizer.tokenize(segment[7])
                if len(tokens) > max_len - 2:
                    # Leave room for [CLS] and [SEP].
                    tokens = tokens[0:max_len - 2]
                tokens = ['[CLS]'] + tokens + ['[SEP]']
                token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                padded = token_ids[0:max_len] + [0] * (max_len -
                                                       len(token_ids))
                length = min(len(token_ids), len(padded))
                fields = [str(label), str(length)]
                fields.extend(str(token_id) for token_id in padded)
                fwt.write(','.join(fields) + '\n')
        return len(attr_values)

    def read(self):
        """Return a shuffled batch of ``[label, sequence, mask]`` tensors read from ``transform.csv``.

        Each CSV line is decoded as ``max_seq_length + 2`` integers: label,
        length, then the token ids. The mask is a sequence mask built from the
        length column.
        """
        transform = os.path.join(self.category_dir, 'train_data',
                                 'transform.csv')
        queue = tf.train.string_input_producer([transform])
        reader = tf.TextLineReader()
        _, value = reader.read(queue)
        row = tf.decode_csv(value, [[0]] * (self.config.max_seq_length + 2))
        label = tf.stack(row[0])
        length = tf.stack(row[1])
        mask = tf.cast(tf.sequence_mask(length, self.config.max_seq_length),
                       tf.int32)
        sequence = tf.stack(row[2:self.config.max_seq_length + 2])
        return tf.train.shuffle_batch([label, sequence, mask],
                                      self.config.batch_size, 50000, 10000)
input_file )  # [:1] # array of lists [text, target_word_idx, correct_word_idx]

# Dump the raw text (first element) of every example into the file that is
# passed to extract_features.py below as --input_file.
with open('100_texts', 'w') as f:
    for example in data:
        f.write(example[0])

# For every text, word-piece tokenize it word by word and remember, for each
# whitespace-separated word, the index of its first word piece.
bert_tokens = []
token_map = []
tokenizer = FullTokenizer(vocab_file=bert + 'vocab.txt', do_lower_case=False)
for text in data:
    text_tokens = ['[CLS]']  # index 0 is [CLS], so word offsets start at 1
    text_map = []
    for word in text[0].split(' '):
        text_map.append(len(text_tokens))
        text_tokens.extend(tokenizer.tokenize(word))
    token_map.append(text_map)
    bert_tokens.append(text_tokens)

# Command line for BERT's extract_features.py; `bert` is used as a path
# prefix for the model files and `layers` as the value of --layers.
# NOTE(review): `bert`, `layers` and `data` are defined outside this excerpt.
args = ['python', '../bert/extract_features.py']
args.append('--input_file=100_texts')
args.append('--output_file=anaphora')
args.append('--vocab_file=' + bert + 'vocab.txt')
args.append('--bert_config_file=' + bert + 'bert_config.json')
args.append('--init_checkpoint=' + bert + 'bert_model.ckpt')
args.append('--layers=' + layers)
args.append('--max_seq_length=128')
args.append('--batch_size=8')
args.append('--do_lower_case=False')
args.append('--attention=True')
class Tuner(object):
    """Base class for fine-tuning a sentence classifier on a labelled corpus.

    Handles tokenizer selection, corpus tokenization/caching, batching, the
    training loop and validation. Subclasses provide ``make_input`` and
    ``tune``, and are expected to define ``self.logits`` and ``self.loss``
    (both are read by ``validation``).
    """

    def __init__(self, train_corpus_fname=None,
                 tokenized_train_corpus_fname=None,
                 test_corpus_fname=None, tokenized_test_corpus_fname=None,
                 model_name="bert", model_save_path=None, vocab_fname=None,
                 eval_every=1000,
                 batch_size=32, num_epochs=10, dropout_keep_prob_rate=0.9,
                 model_ckpt_path=None,
                 sp_model_path=None):
        """Store the configuration, build the tokenizer and load both corpora."""
        # configurations
        tf.logging.set_verbosity(tf.logging.INFO)
        self.model_name = model_name
        self.eval_every = eval_every
        self.model_ckpt_path = model_ckpt_path
        self.model_save_path = model_save_path
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.dropout_keep_prob_rate = dropout_keep_prob_rate
        self.best_valid_score = 0.0
        # makedirs also creates missing parent directories.
        os.makedirs(model_save_path, exist_ok=True)
        # define tokenizer
        if self.model_name == "bert":
            self.tokenizer = FullTokenizer(vocab_file=vocab_fname,
                                           do_lower_case=False)
        elif self.model_name == "xlnet":
            sp = spm.SentencePieceProcessor()
            sp.Load(sp_model_path)
            self.tokenizer = sp
        else:
            self.tokenizer = get_tokenizer("mecab")
        # load or tokenize corpus
        self.train_data, self.train_data_size = self.load_or_tokenize_corpus(
            train_corpus_fname, tokenized_train_corpus_fname)
        self.test_data, self.test_data_size = self.load_or_tokenize_corpus(
            test_corpus_fname, tokenized_test_corpus_fname)

    def load_or_tokenize_corpus(self, corpus_fname, tokenized_corpus_fname):
        """Return ``(data_set, size)`` where each item is ``[tokens, label]``.

        If *tokenized_corpus_fname* exists it is read back; otherwise
        *corpus_fname* (header line skipped, ``sentence\\u241Elabel`` per line)
        is tokenized with the configured tokenizer and the result is cached
        to *tokenized_corpus_fname*. Files are read and written as UTF-8
        because the U+241E separator is not representable in every locale
        encoding.
        """
        data_set = []
        if os.path.exists(tokenized_corpus_fname):
            tf.logging.info("load tokenized corpus : " +
                            tokenized_corpus_fname)
            with open(tokenized_corpus_fname, 'r', encoding='utf-8') as f1:
                for line in f1:
                    tokens, label = line.strip().split("\u241E")
                    if tokens:
                        data_set.append([tokens.split(" "), int(label)])
        else:
            tf.logging.info("tokenize corpus : " + corpus_fname + " > " +
                            tokenized_corpus_fname)
            with open(corpus_fname, 'r', encoding='utf-8') as f2:
                next(f2)  # skip head line
                for line in f2:
                    sentence, label = line.strip().split("\u241E")
                    if self.model_name == "bert":
                        tokens = self.tokenizer.tokenize(sentence)
                    elif self.model_name == "xlnet":
                        normalized_sentence = preprocess_text(sentence,
                                                              lower=False)
                        tokens = encode_pieces(self.tokenizer,
                                               normalized_sentence,
                                               return_unicode=False,
                                               sample=False)
                    else:
                        tokens = self.tokenizer.morphs(sentence)
                        tokens = post_processing(tokens)
                    # Binarize the label.
                    int_label = 1 if int(label) > 0.5 else 0
                    data_set.append([tokens, int_label])
            with open(tokenized_corpus_fname, 'w', encoding='utf-8') as f3:
                for tokens, label in data_set:
                    f3.write(' '.join(tokens) + "\u241E" + str(label) + "\n")
        return data_set, len(data_set)

    def train(self, sess, saver, global_step, output_feed):
        """Run the training batches, logging and validating every ``eval_every`` steps.

        ``output_feed`` must yield four values per ``sess.run``; the last one
        is taken as the batch loss.
        """
        train_batches = self.get_batch(self.train_data,
                                       num_epochs=self.num_epochs,
                                       is_training=True)
        checkpoint_loss = 0.0
        for current_input_feed in train_batches:
            _, _, _, current_loss = sess.run(output_feed, current_input_feed)
            checkpoint_loss += current_loss
            if global_step.eval(sess) % self.eval_every == 0:
                tf.logging.info("global step %d train loss %.4f" %
                                (global_step.eval(sess),
                                 checkpoint_loss / self.eval_every))
                checkpoint_loss = 0.0
                self.validation(sess, saver, global_step)

    def validation(self, sess, saver, global_step):
        """Score the test set and save a checkpoint when accuracy improves."""
        valid_loss, valid_pred, valid_num_data = 0, 0, 0
        output_feed = [self.logits, self.loss]
        test_batches = self.get_batch(self.test_data, num_epochs=1,
                                      is_training=False)
        for current_input_feed, current_labels in test_batches:
            current_logits, current_loss = sess.run(output_feed,
                                                    current_input_feed)
            current_preds = np.argmax(current_logits, axis=-1)
            valid_loss += current_loss
            valid_num_data += len(current_labels)
            valid_pred += sum(
                1 for pred, label in zip(current_preds, current_labels)
                if pred == label)
        if valid_num_data == 0:
            # Fewer test examples than one batch: nothing to score.
            tf.logging.info("validation skipped : no test batch available")
            return
        valid_score = valid_pred / valid_num_data
        tf.logging.info("valid loss %.4f valid score %.4f" %
                        (valid_loss, valid_score))
        if valid_score > self.best_valid_score:
            self.best_valid_score = valid_score
            path = self.model_save_path + "/" + str(valid_score)
            saver.save(sess, path, global_step=global_step)

    def get_batch(self, data, num_epochs, is_training=True):
        """Yield ``make_input(sentences, labels, is_training)`` for every batch.

        The data is reshuffled at the start of each epoch. The number of
        batches per epoch is ``int((data_size - 1) / batch_size)``, so a
        trailing partial batch is not emitted.
        """
        if is_training:
            data_size = self.train_data_size
        else:
            data_size = self.test_data_size
        num_batches_per_epoch = int((data_size - 1) / self.batch_size)
        if is_training:
            tf.logging.info("num_batches_per_epoch : " +
                            str(num_batches_per_epoch))
        for epoch in range(num_epochs):
            idx = random.sample(range(data_size), data_size)
            # Shuffle with plain indexing: the rows are ragged
            # ([token list, label]) and cannot be packed into a NumPy array
            # without an explicit object dtype.
            data = [data[i] for i in idx]
            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * self.batch_size
                end_index = (batch_num + 1) * self.batch_size
                batch_sentences = []
                batch_labels = []
                for sentence, label in data[start_index:end_index]:
                    batch_sentences.append(sentence)
                    batch_labels.append(int(label))
                yield self.make_input(batch_sentences, batch_labels,
                                      is_training)

    def make_input(self, sentences, labels, is_training):
        """Build the model input for one batch; implemented by subclasses.

        ``train`` feeds the result directly to ``sess.run`` while
        ``validation`` unpacks it as ``(feed, labels)``.
        """
        raise NotImplementedError

    def tune(self):
        """Run fine-tuning; implemented by subclasses."""
        raise NotImplementedError
class BertPreprocessor(Preprocessor):
    """Preprocessor for BERT embedding.

    This class can be used to do all the work to create the inputs (and outputs) of a Neural Network using BERT
    as embedding. Currently only single sequence classification is supported.
    """

    def __init__(self, pretrained_model_path: str, **kwargs):
        """Create the tokenizer from the vocabulary of the pretrained hub module.

        :param pretrained_model_path: Spec of the TF-Hub BERT module.
        :param kwargs: Forwarded to the base ``Preprocessor``.
        """
        super().__init__(**kwargs)

        info = hub.Module(spec=pretrained_model_path)(signature="tokenization_info", as_dict=True)

        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run(
                [
                    info["vocab_file"],
                    info["do_lower_case"]
                ]
            )

        # Create the tokenizer with the vocabulary of the pretrained model
        self._tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

        basic_tokens = self._tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]"])
        self._CLS_token = basic_tokens[0]
        self._SEP_token = basic_tokens[1]

    def _padding_sentence(self):
        """Return a zero length sentence to pad last batch.

        :return: Three sequences of zeros (tokens, masks, segment ids).
        """
        return [0] * self._max_seq_len, [0] * self._max_seq_len, [0] * self._max_seq_len

    def tokenize(self, text: str):
        """Convert a sequence of words into a sequence of tokens and also compute the masking- and segment ids.

        For further details please read BERT paper.

        :param text: The sequence of words.
        :return: The sequence of tokens, masks and segment ids, each of length ``_max_seq_len``.
        """
        # if too long cut to size (the first token will be [CLS], the last [SEP])
        tokens_input = self._tokenizer.tokenize(text)[0:(self._max_seq_len - 2)]

        real_ids = [self._CLS_token]
        real_ids.extend(self._tokenizer.convert_tokens_to_ids(tokens_input))
        real_ids.append(self._SEP_token)

        padding = [0] * (self._max_seq_len - len(real_ids))
        input_ids = real_ids + padding
        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        input_mask = [1] * len(real_ids) + padding
        # Single-sequence input: every position belongs to segment 0.
        input_segment_ids = [0] * self._max_seq_len

        # safety check
        assert len(input_ids) == self._max_seq_len
        assert len(input_mask) == self._max_seq_len
        assert len(input_segment_ids) == self._max_seq_len

        return input_ids, input_mask, input_segment_ids

    def fit(self, texts: List[str]) -> 'BertPreprocessor':
        """This function does nothing in case of BERT but must be implemented.

        :param texts: -
        :return: self
        """
        return self

    def transform(self, texts: List[str]) -> list:
        """Transform sequences of words into sequences of tokens, masks and segment ids.

        Masks are used to separate valid and padding tokens. The segment ids are always zero since the whole
        sequence belongs together. For further details please read BERT paper.

        :param texts: The sequences of texts.
        :return: ``[input_ids, input_masks, segment_ids]``, each a tuple with one entry per text
            (three empty tuples when *texts* is empty).
        """
        if len(texts) == 0:
            # zip(*[]) yields nothing to unpack, so handle the empty batch here.
            return [(), (), ()]

        # The context manager shuts the worker processes down once the
        # (blocking) map has returned, so repeated calls do not leak pools.
        with Pool(processes=8) as pool:
            rows = pool.map(self.tokenize, texts)

        input_ids, input_masks, segment_ids = zip(*rows)
        return [input_ids, input_masks, segment_ids]

    def inverse_transform(self, sequences: np.ndarray):
        """Transform sequences of tokens back to sequences of words (sentences).

        :param sequences: The sequences of tokens.
        :return: The sequences of words
        """
        return self._tokenizer.convert_ids_to_tokens(sequences)
class EntityInfer(LoadModelBase):
    """Runs the sequence-labelling model and turns its tags into relation triplets.

    Predictions come either from a TF-Serving endpoint (``url``) or from a
    locally loaded exported model (``export_dir``).
    """

    def __init__(self,
                 vocab_file,
                 export_dir=None,
                 url=None,
                 model_name='models',
                 signature_name=None,
                 do_lower_case=True):
        """Create the tokenizer, the model connection(s) and the label maps."""
        super(EntityInfer, self).__init__(export_dir, url, model_name,
                                          signature_name)
        self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)
        # via gRPC
        if url:
            self.stub, self.request = self.load_grpc_connect()
        if export_dir:
            self.predict_fn = self.load_pb_model()
        self.id_map_predicate = self.id_to_label(model_config.PREDICATE_LABEL)
        self.predicate_map_id = self.label_to_id(model_config.PREDICATE_LABEL)
        self.id_map_sequence = self.id_to_label(model_config.SEQ_LABEL)

    def id_to_label(self, labels):
        """Return a dict mapping each position in *labels* to its label."""
        return dict(enumerate(labels))

    def label_to_id(self, labels):
        """Return a dict mapping each label in *labels* to its position."""
        return {label: i for i, label in enumerate(labels)}

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length."""
        # This is a simple heuristic which will always truncate the longer sequence
        # one token at a time. This makes more sense than truncating an equal percent
        # of tokens from each, since if one sequence is very short then each token
        # that's truncated likely contains more information than a longer sequence.
        while len(tokens_a) + len(tokens_b) > max_length:
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def process(self, sentences, predicate_labels, max_seq_length=64):
        """Serialize every (sentence, predicate label) pair to a ``tf.train.Example`` string.

        Raises:
            ValueError: if *sentences* is empty or not a list/tuple.
        """
        if not sentences or not isinstance(sentences, (list, tuple)):
            raise ValueError(
                '`sentences` must be list object and not a empty list !')
        examples = []
        for sentence, predicate_label in zip(sentences, predicate_labels):
            feature = self.convert_single_example(sentence, predicate_label,
                                                  max_seq_length)
            examples.append(self.convert_single_feature(feature))
        return examples

    def convert_single_example(self, sentence, predicate_label,
                               max_seq_length):
        """Build padded ``InputFeatures`` for one sentence and its predicate label.

        Layout: ``[CLS] tokens [SEP]`` with segment id 0, followed by one
        predicate id per token and a closing ``[SEP]`` with segment id 1,
        then zero padding up to *max_seq_length*.
        """
        tokens = []
        for token in sentence:
            tokens.extend(self.tokenizer.tokenize(token))
        tokens_b = [predicate_label] * len(tokens)
        predicate_label_id = self.predicate_map_id[predicate_label]
        # Truncate tokens and tokens_b to equal length so that their combined
        # length is at most max_seq_length - 3 ([CLS] and two [SEP] are added).
        self._truncate_seq_pair(tokens, tokens_b, max_seq_length - 3)

        tokens_a = ["[CLS]"] + tokens + ["[SEP]"]
        segment_ids = [0] * len(tokens_a)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens_a)

        # bert_tokenizer.convert_tokens_to_ids(["[SEP]"]) --->[102]
        # 1-100 dict index not used
        bias = 1  # add bias so predicate ids differ from the word dict ids
        input_ids.extend([predicate_label_id + bias] * len(tokens_b))
        # `[SEP]` has index 102
        input_ids.append(self.tokenizer.convert_tokens_to_ids(["[SEP]"])[0])
        segment_ids.extend([1] * (len(tokens_b) + 1))

        input_mask = [1] * len(input_ids)
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids.extend(padding)
        input_mask.extend(padding)
        segment_ids.extend(padding)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        feature = InputFeatures(input_ids=input_ids,
                                input_mask=input_mask,
                                segment_ids=segment_ids)
        return feature

    def convert_single_feature(self, feature):
        """Serialize *feature* (ids, mask, segment ids) to a ``tf.train.Example`` string."""
        features = dict()
        features['input_ids'] = tf.train.Feature(int64_list=tf.train.Int64List(
            value=feature.input_ids))
        features['input_mask'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=feature.input_mask))
        features['segment_ids'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=feature.segment_ids))
        example = tf.train.Example(features=tf.train.Features(
            feature=features))
        return example.SerializeToString()

    def infer(self,
              sentences,
              predicate_labels,
              max_seq_length,
              predicate_probabilities=None):
        """
        Run prediction.

        sentences: list, sentences, ['xxxx', 'xxxx'...]
        predicate_labels: list, labels, ['author', 'birthplace'...]
        max_seq_length: int
        predicate_probabilities: list, [0.92, 0.01, ...]
        :return: list, one entry per sentence that produced at least one triplet,
                 [
                    [{'predicate': predicate, 'subject': subj, 'object': entity}, {'predicate': predicate...],
                    [{'predicate': predicate, 'subject': subj, 'object': entity}, {'predicate': predicate...]...
                ]
        """
        examples = self.process(sentences, predicate_labels, max_seq_length)
        if self.url:
            predictions = self.tf_serving_infer(examples)
        else:
            s = time.time()
            predictions = self.local_infer(examples)
            print('sequence:', time.time() - s)

        token_label_predictions = predictions['token_label_predictions']
        predicate_predictions = predictions['predicate_predictions']
        predicate_labels_index = np.argmax(predicate_predictions, -1)

        result = []
        for i in range(len(sentences)):
            token_label = [
                self.id_map_sequence[x] for x in token_label_predictions[i]
            ]
            # Drop the leading [CLS] position and everything from the first
            # predicted [SEP] on; if the model emitted no [SEP], keep the rest
            # (entity_extract is bounded by the sentence length anyway).
            if '[SEP]' in token_label:
                sentence_tags = token_label[1:token_label.index('[SEP]')]
            else:
                sentence_tags = token_label[1:]
            entities = self.entity_extract(sentences[i], sentence_tags)

            predicate_label_index = predicate_labels_index[i]
            model_candidate = (
                self.id_map_predicate[predicate_label_index],
                predicate_predictions[i][predicate_label_index])
            # Compare the relation-classification model's output with the
            # sequence-labelling model's output and keep the more confident one.
            if predicate_probabilities:
                predicate_label = max(
                    [(predicate_labels[i], predicate_probabilities[i]),
                     model_candidate],
                    key=lambda x: x[1])
            else:
                # Always a (label, probability) pair so the [0] below is the
                # label name, not an index into a scalar probability.
                predicate_label = model_candidate

            triplets = self.organize_triplet(entities, predicate_label[0])
            if triplets:
                result.append(triplets)
        return result

    def organize_triplet(self, entities, predicate):
        """
        Convert triplets to dict form; handles one relation, one subject and several objects.

        entities: list, [('xx company', 'SUB'), ('xx company', 'OBJ')]
        predicate: str, the relation
        :return: list, [{'predicate': predicate, 'subject': subj, 'object': entity},
                        {'predicate': predicate, 'subject': subj, 'object': entity}...]
        """
        # The first 'SUB' entity (if any) is the subject of every triplet.
        subj = next((entity for entity, tag in entities if tag == 'SUB'),
                    None)
        return [{
            'predicate': predicate,
            'subject': subj,
            'object': entity
        } for entity, tag in entities if tag == 'OBJ']

    def entity_extract(self, sentence, tags):
        """
        Extract entities from sentence according to tags.

        sentence: str, the sentence
        tags: list, sequence tags, e.g. ['O', 'B-SUB', 'I-SUB'...]
        :return: list, [('xx company', 'SUB'), ('xx company', 'OBJ')]
        """
        entities = []
        sentence_len = len(sentence)
        if sentence_len != len(tags):
            warnings.warn(
                'Token and tags have different lengths.\ndetails:\n{}\n{}'.
                format(sentence, tags))

        entity = Entity(None)
        for i, (token, tag) in enumerate(zip(sentence, tags)):
            if tag == 'O':
                if entity.types:
                    entities.append(entity.get_entity_types())
                entity = Entity(None)
                continue
            elif tag[0] == 'B':
                if entity.types:
                    entities.append(entity.get_entity_types())
                entity = Entity(tag[2:])
                entity.begin = token
            elif tag[0] == 'I':
                if i == sentence_len - 1:
                    entity.intermediate = token
                    entities.append(entity.get_entity_types())
                    break
                try:
                    entity.intermediate = token
                except Exception as e:
                    print(e)
        else:
            # The loop ended without the break above: flush an entity that is
            # still open (e.g. a final 'B-' tag, or tags shorter than the
            # sentence) instead of dropping it.
            if entity.types:
                entities.append(entity.get_entity_types())
        return entities

    def tf_serving_infer(self, examples):
        """Send *examples* to TF-Serving and return its outputs as ``{name: ndarray}``."""
        self.request.inputs['examples'].CopyFrom(
            tf.make_tensor_proto(examples, dtype=types_pb2.DT_STRING))
        response = self.stub.Predict(self.request, 5.0)
        predictions = {}
        for key in response.outputs:
            tensor_proto = response.outputs[key]
            predictions[key] = tf.contrib.util.make_ndarray(tensor_proto)
        return predictions

    def local_infer(self, examples):
        """
        Predict with the locally loaded model; *examples* as in ``tf_serving_infer``.
        """
        predictions = self.predict_fn({'examples': examples})
        return predictions
return segments + [0] * (max_seq_length - len(tokens))  # zero-pad the segment ids to max_seq_length


def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab

    Converts *tokens* to vocabulary ids with *tokenizer* and right-pads the
    result with zeros up to *max_seq_length*. No truncation is done: if there
    are more tokens than *max_seq_length*, the returned list is longer.
    """
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

# Build the tokenizer from the vocabulary and casing flag of the hub layer.
# NOTE(review): `bert_layer`, `model`, `max_seq_length`, `get_masks` and
# `get_segments` are defined outside this excerpt.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

# Tokenize a sample sentence and wrap it in the BERT boundary tokens.
s = "This is a nice sentence."
stokens = tokenizer.tokenize(s)
stokens = ["[CLS]"] + stokens + ["[SEP]"]
print('s :', s)
print('stokens :', stokens)
print()

# Build the three model inputs and run the model on a batch of one.
input_ids = get_ids(stokens, tokenizer, max_seq_length)
input_masks = get_masks(stokens, max_seq_length)
input_segments = get_segments(stokens, max_seq_length)
pool_embs, all_embs = model.predict([[input_ids],[input_masks],[input_segments]])

import numpy as np
# Embedding at position 0 of the first batch item (the [CLS] position).
cls_embs = np.array([all_embs[0][0]])
print('pool_embs.shape: ', pool_embs.shape)
class BertEmbeddingsResolver:
    """Computes per-token BERT embeddings for pre-tokenized sentences.

    The model is restored from *model_folder*, which must contain
    ``vocab.txt``, ``bert_config.json`` and the checkpoint.
    """

    def __init__(self, model_folder, max_length=256, lowercase=True):
        """Create the tokenizer, build the BERT graph and restore its weights."""
        # 1. Create tokenizer
        self.max_length = max_length
        vocab_file = os.path.join(model_folder, 'vocab.txt')
        self.tokenizer = FullTokenizer(vocab_file, do_lower_case=lowercase)

        # 2. Read Config
        config_file = os.path.join(model_folder, 'bert_config.json')
        self.config = BertConfig.from_json_file(config_file)

        # 3. Create Model
        self.session = tf.Session()
        self.token_ids_op = tf.placeholder(tf.int32,
                                           shape=(None, max_length),
                                           name='token_ids')
        self.model = BertModel(config=self.config,
                               is_training=False,
                               input_ids=self.token_ids_op,
                               use_one_hot_embeddings=False)

        # 4. Restore Trained Model
        self.saver = tf.train.Saver()
        # NOTE(review): the trailing comment names an alternative checkpoint;
        # confirm which file is meant to be active.
        ckpt_file = os.path.join(model_folder, 'bert_model.ckpt')  # RCS ckpt_file = os.path.join(model_folder, 'model.ckpt-1000000')
        self.saver.restore(self.session, ckpt_file)

        hidden_layers = self.config.num_hidden_layers
        self.embeddings_op = tf.get_default_graph().get_tensor_by_name(
            "bert/encoder/Reshape_{}:0".format(hidden_layers + 1))

    def tokenize_sentence(self, tokens, add_service_tokens=True):
        """Split *tokens* into word pieces and mark where each word starts.

        Args:
            tokens: the words of one sentence.
            add_service_tokens: wrap the result in ``[CLS]``/``[SEP]``.

        Returns:
            ``(pieces, is_word_start)``: the word pieces, cut so the total
            (including service tokens) does not exceed ``max_length``, and a
            parallel list of booleans that is True on the first piece of each
            word and False on continuation pieces and service tokens.
        """
        result = []
        is_word_start = []
        for token in tokens:
            # A token made only of whitespace/control characters yields no
            # pieces; represent it as [UNK] so every word keeps exactly one
            # start marker.
            pieces = self.tokenizer.tokenize(token) or ['[UNK]']
            result.extend(pieces)
            is_word_start.extend([True] + [False] * (len(pieces) - 1))

        if add_service_tokens:
            limit = self.max_length - 2  # room for [CLS] and [SEP]
            if len(result) > limit:
                result = result[:limit]
                is_word_start = is_word_start[:limit]
            result = ['[CLS]'] + result + ['[SEP]']
            is_word_start = [False] + is_word_start + [False]
        else:
            if len(result) > self.max_length:
                result = result[:self.max_length]
                is_word_start = is_word_start[:self.max_length]

        return (result, is_word_start)

    def _encode(self, sentence):
        """Return ``(pieces, is_word_start, padded_ids)`` for one sentence."""
        tokens, is_word_start = self.tokenize_sentence(sentence)
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        padded = np.pad(token_ids, [(0, self.max_length - len(token_ids))],
                        mode='constant')
        return tokens, is_word_start, padded

    def resolve_sentences(self, sentences):
        """Embed a batch of sentences in one session run.

        Returns:
            One ``TokenEmbeddings.create_sentence`` result per sentence, built
            from its pieces, word-start flags and the matching embedding rows.
        """
        batch_is_word_start = []
        batch_token_ids = []
        batch_tokens = []

        for sentence in sentences:
            tokens, is_word_start, to_input = self._encode(sentence)
            batch_token_ids.append(to_input)
            batch_tokens.append(tokens)
            batch_is_word_start.append(is_word_start)

        embeddings = self.session.run(
            self.embeddings_op, feed_dict={self.token_ids_op: batch_token_ids})

        result = []
        for i, (tokens, is_word_start) in enumerate(
                zip(batch_tokens, batch_is_word_start)):
            # Drop the rows that belong to padding.
            item_embeddings = embeddings[i, :len(tokens), :]
            resolved = TokenEmbeddings.create_sentence(tokens, is_word_start,
                                                       item_embeddings)
            result.append(resolved)

        return result

    def resolve_sentence(self, sentence):
        """Embed a single sentence; see ``resolve_sentences`` for the result type."""
        tokens, is_word_start, to_input = self._encode(sentence)
        to_input = to_input.reshape((1, self.max_length))

        embeddings = self.session.run(self.embeddings_op,
                                      feed_dict={self.token_ids_op: to_input})
        embeddings = np.squeeze(embeddings)
        # Drop the rows that belong to padding.
        embeddings = embeddings[:len(tokens), :]

        return TokenEmbeddings.create_sentence(tokens, is_word_start,
                                               embeddings)
class BERTVectorizer:
    """Turn raw text into padded id/mask/segment arrays for a Hub module.

    The tokenizer is created from the ``tokenization_info`` signature of the
    TF-Hub module at ``bert_model_hub_path``.
    """

    def __init__(
            self,
            sess,
            is_bert,
            # bert_model_hub_path='https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1'
            bert_model_hub_path="https://tfhub.dev/google/albert_base/1"):
        """Store the session/module path and build the tokenizer.

        Args:
            sess: Session used to evaluate the module's tokenization info.
            is_bert: Selects the BERT tokenizer when true, otherwise the
                ALBERT tokenizer.
            bert_model_hub_path: Hub module handle to load.
        """
        self.sess = sess
        self.is_bert = is_bert
        self.bert_model_hub_path = bert_model_hub_path
        self.create_tokenizer_from_hub_module(is_bert=is_bert)

    def create_tokenizer_from_hub_module(self, is_bert):
        """Get the vocab file and casing info from the Hub module."""
        module = hub.Module(self.bert_model_hub_path)
        info = module(signature="tokenization_info", as_dict=True)
        fetches = [
            info["vocab_file"],
            info["do_lower_case"],
        ]
        vocab_file, do_lower_case = self.sess.run(fetches)

        if is_bert:
            from bert.tokenization import FullTokenizer
            self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
            return

        # The ALBERT tokenizer additionally receives the vocab file as its
        # sentencepiece model file.
        from vectorizers.albert_tokenization import FullTokenizer
        self.tokenizer = FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case,
                                       spm_model_file=vocab_file)

    def tokenize(self, text: str):
        """Split ``text`` on whitespace, then into sub-word pieces.

        Returns:
            ``(pieces, valid_positions)`` where ``valid_positions`` holds 1 for
            the first piece of every word and 0 for the remaining pieces.
        """
        pieces_all = []
        valid_positions = []
        for word in text.split():  # whitespace tokenizer
            pieces = self.tokenizer.tokenize(word)
            pieces_all.extend(pieces)
            valid_positions.extend(
                1 if position == 0 else 0 for position in range(len(pieces)))
        return pieces_all, valid_positions

    def transform(self, text_arr):
        """Vectorize every text and post-pad the batch to a common length.

        Returns:
            ``(input_ids, input_mask, segment_ids, valid_positions,
            sequence_lengths)``; the first four are padded arrays and the last
            one holds the unpadded length of each sequence.
        """
        rows = [self.__vectorize(text) for text in text_arr]
        input_ids = [row[0] for row in rows]
        input_mask = [row[1] for row in rows]
        segment_ids = [row[2] for row in rows]
        valid_positions = [row[3] for row in rows]

        # Lengths must be taken before padding.
        sequence_lengths = np.array([len(ids) for ids in input_ids])

        pad = tf.keras.preprocessing.sequence.pad_sequences
        input_ids = pad(input_ids, padding='post')
        input_mask = pad(input_mask, padding='post')
        segment_ids = pad(segment_ids, padding='post')
        valid_positions = pad(valid_positions, padding='post')
        return (input_ids, input_mask, segment_ids, valid_positions,
                sequence_lengths)

    def __vectorize(self, text: str):
        """Build ids, mask, segment ids and valid positions for one text.

        ``[CLS]`` and ``[SEP]`` are added around the pieces and both are
        marked as valid positions.
        """
        pieces, piece_positions = self.tokenize(text)
        tokens = ['[CLS]'] + pieces + ['[SEP]']
        valid_positions = [1] + piece_positions + [1]

        segment_ids = [0] * len(tokens)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        return input_ids, input_mask, segment_ids, valid_positions
class Tuner(object):
    """Base class for fine-tuning a binary sentence classifier.

    Handles corpus tokenization/caching, batching, the training loop and
    validation. Subclasses must implement ``make_input`` and ``tune`` and are
    expected to define ``self.logits`` and ``self.loss`` (read by
    ``validation``).
    """

    def __init__(self, train_corpus_fname=None,
                 tokenized_train_corpus_fname=None,
                 test_corpus_fname=None,
                 tokenized_test_corpus_fname=None,
                 model_name='bert', model_save_path=None, vocab_fname=None,
                 eval_every=1000,
                 batch_size=32, num_epochs=10, dropout_keep_prob_rate=0.9,
                 model_ckpt_path=None):
        """Store hyper-parameters, build the tokenizer and load both corpora.

        Args:
            train_corpus_fname / test_corpus_fname: Raw corpora, one
                ``sentence␞label`` pair per line after a header line.
            tokenized_train_corpus_fname / tokenized_test_corpus_fname:
                Cache files for the tokenized corpora.
            model_name: ``'bert'`` selects ``FullTokenizer``; anything else
                uses the ``'mecab'`` tokenizer.
            model_save_path: Directory prefix for saved checkpoints.
            vocab_fname: Vocabulary file for the BERT tokenizer.
            eval_every: Validate every this many global steps.
            batch_size, num_epochs, dropout_keep_prob_rate, model_ckpt_path:
                Stored on the instance for use by subclasses and the loops
                below.
        """
        self.model_name = model_name
        self.eval_every = eval_every
        self.model_ckpt_path = model_ckpt_path
        self.model_save_path = model_save_path
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.dropout_keep_prob_rate = dropout_keep_prob_rate
        self.best_valid_score = 0.0

        # tokenizer defining
        if self.model_name == 'bert':
            self.tokenizer = FullTokenizer(vocab_file=vocab_fname,
                                           do_lower_case=False)
        else:
            self.tokenizer = get_tokenizer('mecab')

        # load or tokenize corpus
        self.train_data, self.train_data_size = self.load_or_tokenize_corpus(
            train_corpus_fname, tokenized_train_corpus_fname)
        self.test_data, self.test_data_size = self.load_or_tokenize_corpus(
            test_corpus_fname, tokenized_test_corpus_fname)

    def load_or_tokenize_corpus(self, corpus_fname, tokenized_corpus_fname):
        """Return ``(data_set, size)`` from the cache or by tokenizing.

        When ``tokenized_corpus_fname`` exists it is read directly. Otherwise
        ``corpus_fname`` is tokenized (skipping its header line), labels are
        binarized (``>= 1`` becomes 1, everything else 0) and, if a cache path
        was given, the result is written there. Blank lines are ignored.

        Each entry of ``data_set`` is ``[tokens, int_label]``.
        """
        data_set = []
        if tokenized_corpus_fname and os.path.exists(tokenized_corpus_fname):
            tf.logging.info('load tokenized corpus : ' + tokenized_corpus_fname)
            # Files contain the U+241E separator, so the encoding is pinned.
            with open(tokenized_corpus_fname, 'r', encoding='utf-8') as f1:
                for line in f1:
                    line = line.strip()
                    if not line:
                        continue
                    tokens, label = line.split('\u241E')
                    if len(tokens) > 0:
                        data_set.append([tokens.split(" "), int(label)])
        else:
            with open(corpus_fname, 'r', encoding='utf-8') as f2:
                next(f2, None)  # skip head line (tolerates an empty file)
                for line in f2:
                    line = line.strip()
                    if not line:
                        continue
                    sentence, label = line.split('\u241E')
                    if self.model_name == 'bert':
                        tokens = self.tokenizer.tokenize(sentence)
                    else:
                        tokens = self.tokenizer.morphs(sentence)
                        tokens = post_processing(tokens)
                    # labelling
                    int_label = 1 if int(label) >= 1 else 0
                    data_set.append([tokens, int_label])
            if tokenized_corpus_fname:
                with open(tokenized_corpus_fname, 'w', encoding='utf-8') as f3:
                    for tokens, label in data_set:
                        f3.write(' '.join(tokens) + '\u241E' + str(label) + '\n')
        return data_set, len(data_set)

    def get_batch(self, data, num_epochs, is_training=True):
        """Yield ``make_input(...)`` results batch by batch.

        Args:
            data: Sequence of ``[tokens, label]`` entries.
            num_epochs: Number of passes over ``data``.
            is_training: When true the order is reshuffled every epoch;
                otherwise the data is visited in its stored order.

        Yields:
            Whatever ``self.make_input(sentences, labels, is_training)``
            returns for each batch. The last batch of an epoch may be smaller
            than ``self.batch_size``; empty data yields nothing.
        """
        data_size = len(data)
        # Ceiling division: 0 batches for empty data.
        num_batches_per_epoch = (
            (data_size + self.batch_size - 1) // self.batch_size)
        for _ in range(num_epochs):
            if is_training:
                order = random.sample(range(data_size), data_size)
            else:
                order = range(data_size)
            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * self.batch_size
                end_index = start_index + self.batch_size
                # Index the list directly; rows are ragged and cannot be
                # packed into a regular NumPy array.
                batch = [data[i] for i in order[start_index:end_index]]
                batch_sentences = [sentence for sentence, _ in batch]
                batch_labels = [int(label) for _, label in batch]
                yield self.make_input(batch_sentences, batch_labels,
                                      is_training)

    def train(self, sess, saver, global_step, output_feed):
        """Run the training loop, validating every ``eval_every`` steps.

        ``output_feed`` must evaluate to four values, the last one being the
        batch loss.
        """
        train_batches = self.get_batch(self.train_data, self.num_epochs,
                                       is_training=True)
        checkpoint_loss = 0.0
        for current_input_feed in train_batches:
            _, _, _, current_loss = sess.run(output_feed, current_input_feed)
            checkpoint_loss += current_loss
            step = global_step.eval(sess)
            if step % self.eval_every == 0:
                tf.logging.info("global step %d train loss %.4f" %
                                (step, checkpoint_loss / self.eval_every))
                checkpoint_loss = 0.0
                self.validation(sess, saver, global_step)

    def validation(self, sess, saver, global_step):
        """Score the test set and checkpoint the model on a new best score.

        The score is the fraction of correct ``argmax`` predictions; the
        logged loss is the sum of the batch losses. Nothing is scored or
        saved when the test set is empty.
        """
        valid_loss, valid_pred, valid_num_data = 0, 0, 0
        output_feed = [self.logits, self.loss]
        test_batches = self.get_batch(self.test_data, num_epochs=1,
                                      is_training=False)
        for current_input_feed, current_labels in test_batches:
            current_logits, current_loss = sess.run(output_feed,
                                                    current_input_feed)
            current_preds = np.argmax(current_logits, axis=-1)
            valid_loss += current_loss
            valid_num_data += len(current_labels)
            valid_pred += sum(
                1 for pred, label in zip(current_preds, current_labels)
                if pred == label)
        if valid_num_data == 0:
            tf.logging.info('validation skipped : no test data')
            return
        valid_score = valid_pred / valid_num_data
        tf.logging.info('valid loss %.4f valid score %.4f' %
                        (valid_loss, valid_score))
        if valid_score > self.best_valid_score:
            self.best_valid_score = valid_score
            path = self.model_save_path + '/' + str(valid_score)
            saver.save(sess, path, global_step=global_step)

    def make_input(self, sentences, labels, is_training):
        """Build the model feed for one batch (implemented by subclasses)."""
        raise NotImplementedError

    def tune(self):
        """Entry point of the fine-tuning run (implemented by subclasses)."""
        raise NotImplementedError
class BERTModel:
    """Pre-trained BERT wrapper returning per-word-piece embeddings.

    Configuration comes from the module-level ``args`` object
    (``pretrain_models_path`` and ``bert_model_name``).
    """

    def __init__(self):
        """Build the inference graph and restore ``bert_model.ckpt``."""
        bert_pretrained_dir = args.pretrain_models_path + args.bert_model_name
        # Lower-casing is enabled for model names starting with 'uncased'.
        self.do_lower_case = args.bert_model_name.startswith('uncased')
        self.vocab_file = os.path.join(bert_pretrained_dir, 'vocab.txt')
        self.config_file = os.path.join(bert_pretrained_dir,
                                        'bert_config.json')
        self.tokenizer = FullTokenizer(vocab_file=self.vocab_file,
                                       do_lower_case=self.do_lower_case)

        self.input_id = tf.placeholder(tf.int64, [None, None], 'input_ids')
        self.input_mask = tf.placeholder(tf.int64, [None, None], 'input_mask')
        self.segment_ids = tf.placeholder(tf.int64, [None, None],
                                          'segment_ids')

        bert_config = BertConfig.from_json_file(self.config_file)
        model = BertModel(config=bert_config,
                          is_training=False,
                          input_ids=self.input_id,
                          input_mask=self.input_mask,
                          token_type_ids=self.segment_ids,
                          use_one_hot_embeddings=True,
                          scope='bert')
        self.output_layer = model.get_sequence_output()
        self.embedding_layer = model.get_embedding_output()

        saver = tf.train.Saver()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.session = tf.Session(config=config)
        saver.restore(self.session, bert_pretrained_dir + '/bert_model.ckpt')

    def tokenize(self, token_list, attributes_list):
        """Tokenize words, expand their attributes and embed the pieces.

        Args:
            token_list: List of word strings.
            attributes_list: List of attribute sequences, each aligned with
                ``token_list``. May be empty.

        Returns:
            ``(last_layer, embedding, token_ids, output_list, masks)`` where
            ``token_ids`` excludes ``[CLS]``/``[SEP]``, ``output_list`` holds
            every attribute sequence repeated once per word piece, and
            ``masks`` is 1 on the first piece of each word and 0 elsewhere.
            Words for which the tokenizer yields no pieces contribute nothing.

        Raises:
            AssertionError: If the number of embedding rows differs from the
                number of word pieces.
        """
        num_attributes = len(attributes_list)
        output_list = [[] for _ in range(num_attributes)]
        pieces = ["[CLS]"]
        masks = []
        for token_id, token in enumerate(token_list):
            new_tokens = self.tokenizer.tokenize(token)
            if not new_tokens:
                # No word pieces for this token: nothing to mark or expand.
                continue
            pieces.extend(new_tokens)
            for att_id in range(num_attributes):
                output_list[att_id].extend(
                    [attributes_list[att_id][token_id]] * len(new_tokens))
            masks.extend([1] + [0] * (len(new_tokens) - 1))
        pieces.append("[SEP]")

        token_ids = self.tokenizer.convert_tokens_to_ids(pieces)
        last_layer, embedding = self.get_embeddings(token_ids)

        # ``masks`` has one entry per word piece regardless of how many
        # attribute sequences were supplied, so it is the reference length.
        if len(last_layer) != len(masks):
            print(token_list)
            print(token_ids)
            for list_i in output_list:
                print(list_i)
            raise AssertionError(
                'embedding rows (%d) do not match word pieces (%d)' %
                (len(last_layer), len(masks)))
        return last_layer, embedding, token_ids[1:-1], output_list, masks

    def get_embeddings(self, token_ids):
        """Run the model on one id sequence.

        Returns:
            ``(outputs, embeddings)`` for the single sequence with the first
            and last positions removed.
        """
        input_mask = [[1] * len(token_ids)]
        segment_ids = [[0] * len(token_ids)]
        input_id = [token_ids]
        outputs, emb = self.session.run(
            [self.output_layer, self.embedding_layer],
            feed_dict={
                self.input_mask: input_mask,
                self.segment_ids: segment_ids,
                self.input_id: input_id
            })
        # Strip the [CLS] / [SEP] positions added by the callers.
        return outputs[0][1:-1], emb[0][1:-1]

    def tokenize_sentence(self, token_list):
        """Return the word-piece ids of ``token_list`` without service tokens."""
        pieces = ["[CLS]"]
        for token in token_list:
            pieces.extend(self.tokenizer.tokenize(token))
        pieces.append("[SEP]")
        token_ids = self.tokenizer.convert_tokens_to_ids(pieces)
        return token_ids[1:-1]