import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model
from bert import bert_tokenization


def get_bert_tensorflow_hub_model(
        max_seq_length=128,
        module_hub_url="https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"):
    # Three fixed-length integer inputs expected by the TF Hub BERT layer.
    input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")

    bert_layer = hub.KerasLayer(module_hub_url, trainable=True)
    # bert_layer = hub.KerasLayer("C:/sc/sync/projects/00model/bert/uncased_new", trainable=True)
    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

    # Build the matching WordPiece tokenizer from the vocab shipped with the module.
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()

    model = Model(inputs=[input_word_ids, input_mask, segment_ids],
                  outputs=[pooled_output, sequence_output])
    tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)
    return model, tokenizer
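# A minimal usage sketch (not part of the snippet above): it assumes the function has been
# defined as written and that the TF Hub module can be downloaded. The sample sentence and
# variable names are illustrative only.
model, tokenizer = get_bert_tensorflow_hub_model(max_seq_length=128)
tokens = tokenizer.tokenize("hello world")            # WordPiece tokens
token_ids = tokenizer.convert_tokens_to_ids(tokens)   # vocabulary ids for those tokens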
def create_tokenizer(model_dir=args.model_dir, do_lower_case=args.do_lower_case, name='bert'):
    if name == 'bert':
        bert.bert_tokenization.validate_case_matches_checkpoint(
            args.do_lower_case, op.join(model_dir, 'bert_model.ckpt'))
        return bert_tokenization.FullTokenizer(vocab_file=op.join(model_dir, 'vocab.txt'),
                                               do_lower_case=do_lower_case)
    raise NotImplementedError("* available tokenizers: [ bert, ]")
def createTokenizer():
    currentDir = os.path.dirname(os.path.realpath(__file__))
    modelsFolder = os.path.join(currentDir, "models", "multi_cased_L-12_H-768_A-12")
    vocab_file = os.path.join(modelsFolder, "vocab.txt")
    # multi_cased checkpoints preserve case, so the tokenizer must not lowercase input.
    tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case=False)
    return tokenizer
def __init__(self, vocab_file, do_lower_case, max_seq_length, sentence_column,
             second_sentence_column=None):
    self.vocab_file = vocab_file
    self.do_lower_case = do_lower_case
    tokenizer = bert_tokenization.FullTokenizer(vocab_file.asset_path.numpy(),
                                                do_lower_case.numpy())
    super().__init__(tokenizer, max_seq_length, sentence_column, second_sentence_column)
def encode():
    tokenizer = bert_tokenization.FullTokenizer(vocab_file=BERT_VOCAB, do_lower_case=True)
    inferModel = load_model('docs/saved_model.hdf5', custom_objects={'KerasLayer': hub.KerasLayer})
    # Use an intermediate layer of the trained model as a sentence-embedding extractor.
    embedding_model = Model(inputs=inferModel.inputs, outputs=inferModel.layers[6].output[0])

    q_df = pd.read_csv('./data/doc_repository.txt').dropna()
    q_df.columns = ['question']
    uni_q = q_df['question'].unique().tolist()

    # The right-hand side of the paired input is filled with blank sentences.
    left_tokens = get_tokens(uni_q)
    right = [' ' for i in range(len(uni_q))]
    right_tokens = get_tokens(right)
    input_ids_left, input_masks_left, segment_ids_left = get_input_matrix(left_tokens)
    input_ids_right, input_masks_right, segment_ids_right = get_input_matrix(right_tokens)

    q_embedding = embedding_model.predict([input_ids_left, input_masks_left, segment_ids_left,
                                           input_ids_right, input_masks_right, segment_ids_right])
    embedding_df = pd.DataFrame(q_embedding)
    question_embedding_df = q_df.merge(embedding_df, left_index=True, right_index=True)
    question_embedding_df.to_csv('./docs/question_embedding.csv', index=False, header=False)
    return
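# get_tokens and get_input_matrix are not defined in this snippet. The sketch below is one
# plausible implementation, assuming a module-level `tokenizer` and a MAX_SEQ_LENGTH constant;
# the names, padding scheme, and helpers are assumptions, not the original code.
import numpy as np

MAX_SEQ_LENGTH = 128

def get_tokens(sentences):
    # Wrap each WordPiece-tokenized sentence with the [CLS]/[SEP] markers BERT expects.
    return [["[CLS]"] + tokenizer.tokenize(s) + ["[SEP]"] for s in sentences]

def get_input_matrix(token_lists):
    input_ids, input_masks, segment_ids = [], [], []
    for tokens in token_lists:
        ids = tokenizer.convert_tokens_to_ids(tokens)[:MAX_SEQ_LENGTH]
        pad = MAX_SEQ_LENGTH - len(ids)
        input_ids.append(ids + [0] * pad)               # pad with the [PAD] id (0)
        input_masks.append([1] * len(ids) + [0] * pad)  # 1 for real tokens, 0 for padding
        segment_ids.append([0] * MAX_SEQ_LENGTH)        # single-segment inputs
    return np.array(input_ids), np.array(input_masks), np.array(segment_ids)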
def __init__(self, bert_layer, max_seq_length=128, lr=0.0001, epochs=3, batch_size=32):
    # BERT and tokenization params
    self.bert_layer = bert_layer
    self.max_seq_length = max_seq_length
    vocab_file = self.bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = self.bert_layer.resolved_object.do_lower_case.numpy()
    self.tokenizer = bt.FullTokenizer(vocab_file, do_lower_case)

    # Learning control params
    self.lr = lr
    self.epochs = epochs
    self.batch_size = batch_size

    self.models = []
    self.scores = {}
# +
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")

bert_layer = hub.KerasLayer(module_hub_url, trainable=True)
# bert_layer = hub.KerasLayer("C:/sc/sync/projects/00model/bert/uncased_new", trainable=True)
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()

model = Model(inputs=[input_word_ids, input_mask, segment_ids],
              outputs=[pooled_output, sequence_output])
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)
# -

# +
stokens = tokenizer.tokenize(str_test)
# stokens = ["[CLS]"] + stokens + ["[SEP]"]
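# +
# A minimal continuation sketch, assuming the cells above have run and `max_seq_length` is
# defined. The original leaves the [CLS]/[SEP] line commented out; the special tokens,
# padding, and predict call below are illustrative additions, not part of the source notebook.
import numpy as np

stokens = ["[CLS]"] + stokens + ["[SEP]"]
ids = tokenizer.convert_tokens_to_ids(stokens)
pad_len = max_seq_length - len(ids)

word_ids = np.array([ids + [0] * pad_len])            # pad with the [PAD] id (0)
mask = np.array([[1] * len(ids) + [0] * pad_len])     # 1 for real tokens, 0 for padding
segments = np.array([[0] * max_seq_length])           # single sentence -> all segment 0

pooled, sequence = model.predict([word_ids, mask, segments])
# -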