Example #1
# Assumed imports: TensorFlow 2.x, TensorFlow Hub and the bert-for-tf2 package.
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model
from bert import bert_tokenization


def get_bert_tensorflow_hub_model(
    max_seq_length=128,
    module_hub_url="https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
):

    input_word_ids = tf.keras.layers.Input(shape=(max_seq_length, ),
                                           dtype=tf.int32,
                                           name="input_word_ids")
    input_mask = tf.keras.layers.Input(shape=(max_seq_length, ),
                                       dtype=tf.int32,
                                       name="input_mask")
    segment_ids = tf.keras.layers.Input(shape=(max_seq_length, ),
                                        dtype=tf.int32,
                                        name="segment_ids")

    bert_layer = hub.KerasLayer(module_hub_url, trainable=True)
    #bert_layer = hub.KerasLayer("C:/sc/sync/projects/00model/bert/uncased_new", trainable=True)

    # pooled_output: the [CLS] sentence embedding; sequence_output: per-token embeddings
    pooled_output, sequence_output = bert_layer(
        [input_word_ids, input_mask, segment_ids])
    # The vocabulary file and lower-casing flag are shipped as assets of the hub module
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()

    model = Model(inputs=[input_word_ids, input_mask, segment_ids],
                  outputs=[pooled_output, sequence_output])
    tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

    return model, tokenizer
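A brief usage sketch (assumed, not part of the original example): build the model/tokenizer pair and encode one sample sentence the way BERT expects.

model, tokenizer = get_bert_tensorflow_hub_model(max_seq_length=128)

# WordPiece-tokenize a sample sentence and map it to vocabulary ids
tokens = ["[CLS]"] + tokenizer.tokenize("the quick brown fox") + ["[SEP]"]
token_ids = tokenizer.convert_tokens_to_ids(tokens)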
Example #2
# Assumes `import os.path as op`, `import bert`, `from bert import bert_tokenization`
# and a parsed `args` namespace providing `model_dir` and `do_lower_case`.
def create_tokenizer(model_dir=args.model_dir, do_lower_case=args.do_lower_case, name='bert'):
    if name == 'bert':
        bert.bert_tokenization.validate_case_matches_checkpoint(args.do_lower_case,
                                                                op.join(model_dir, 'bert_model.ckpt'))
        return bert_tokenization.FullTokenizer(vocab_file=op.join(model_dir, 'vocab.txt'),
                                               do_lower_case=do_lower_case)
    raise NotImplementedError("* available tokenizers: [ bert, ]")
Example #3
def createTokenizer():
    currentDir = os.path.dirname(os.path.realpath(__file__))
    modelsFolder = os.path.join(currentDir, "models", "multi_cased_L-12_H-768_A-12")
    vocab_file = os.path.join(modelsFolder, "vocab.txt")

    # multi_cased_* is a cased checkpoint, so lower-casing must be disabled
    # (validate_case_matches_checkpoint in Example #2 would reject do_lower_case=True here).
    tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case=False)
    return tokenizer
Example #4
    def __init__(self,
                 vocab_file,
                 do_lower_case,
                 max_seq_length,
                 sentence_column,
                 second_sentence_column=None):
        self.vocab_file = vocab_file
        self.do_lower_case = do_lower_case
        tokenizer = bert_tokenization.FullTokenizer(
            vocab_file.asset_path.numpy(), do_lower_case.numpy())

        super().__init__(tokenizer, max_seq_length, sentence_column,
                         second_sentence_column)
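In this example, vocab_file and do_lower_case are the asset/variable objects exposed by a TF Hub BERT layer (as in Example #1), not a plain path and boolean. A hedged sketch of how they might be obtained; the subclass name BertSentencePreprocessor is invented for illustration.

import tensorflow_hub as hub

bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=False)

# Pass the resolved assets straight through; the __init__ above calls
# .asset_path.numpy() / .numpy() on them itself.
preprocessor = BertSentencePreprocessor(      # hypothetical subclass name
    vocab_file=bert_layer.resolved_object.vocab_file,
    do_lower_case=bert_layer.resolved_object.do_lower_case,
    max_seq_length=128,
    sentence_column="sentence")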
Example #5
def encode():
    # Encode every unique question from the document repository with a previously
    # saved BERT model and write the embedding vectors to CSV.
    tokenizer = bert_tokenization.FullTokenizer(vocab_file=BERT_VOCAB, do_lower_case=True)
    inferModel = load_model('docs/saved_model.hdf5', custom_objects={'KerasLayer': hub.KerasLayer})
    embedding_model = Model(inputs=inferModel.inputs, outputs=inferModel.layers[6].output[0])
    q_df = pd.read_csv('./data/doc_repository.txt').dropna()
    q_df.columns = ['question']
    uni_q = q_df['question'].unique().tolist()
    left_tokens = get_tokens(uni_q)
    right = [' ' for i in range(len(uni_q))]
    right_tokens = get_tokens(right)
    input_ids_left, input_masks_left, segment_ids_left = get_input_matrix(left_tokens)
    input_ids_right, input_masks_right, segment_ids_right = get_input_matrix(right_tokens)
    q_embedding = embedding_model.predict([input_ids_left, input_masks_left, segment_ids_left,
                                           input_ids_right, input_masks_right, segment_ids_right])
    embedding_df = pd.DataFrame(q_embedding)
    question_embedding_df = q_df.merge(embedding_df, left_index=True, right_index=True)
    question_embedding_df.to_csv('./docs/question_embedding.csv', index=False, header=False)
    return
Example #6
    def __init__(self,
                 bert_layer,
                 max_seq_length=128,
                 lr=0.0001,
                 epochs=3,
                 batch_size=32):

        # BERT and Tokenization params
        self.bert_layer = bert_layer

        self.max_seq_length = max_seq_length
        # `bt` is assumed to be the bert_tokenization module
        # (e.g. `from bert import bert_tokenization as bt`)
        vocab_file = self.bert_layer.resolved_object.vocab_file.asset_path.numpy()
        do_lower_case = self.bert_layer.resolved_object.do_lower_case.numpy()
        self.tokenizer = bt.FullTokenizer(vocab_file, do_lower_case)

        # Learning control params
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size

        self.models = []
        self.scores = {}
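A hedged instantiation sketch for the class above; the original only shows __init__, so the class name (BertWrapper) is invented and the hub URL is carried over from Example #1.

import tensorflow_hub as hub

bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=True)

wrapper = BertWrapper(bert_layer,      # hypothetical class name
                      max_seq_length=128,
                      lr=1e-4,
                      epochs=3,
                      batch_size=32)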
Example #7

# +
# Assumed to be defined in an earlier notebook cell: the imports from Example #1 plus
#   max_seq_length = 128
#   module_hub_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")

bert_layer = hub.KerasLayer(module_hub_url, trainable=True)
#bert_layer = hub.KerasLayer("C:/sc/sync/projects/00model/bert/uncased_new", trainable=True)

pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()

model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=[pooled_output, sequence_output])
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)
# -


# +


# `str_test` is assumed to hold the sentence to encode
stokens = tokenizer.tokenize(str_test)
#stokens = ["[CLS]"] + stokens + ["[SEP]"]
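The cell stops after tokenization. A hedged continuation sketch of the usual next steps: re-add the [CLS]/[SEP] markers (commented out above), pad everything to max_seq_length, and run the model built earlier in the cell.

import numpy as np

stokens = ["[CLS]"] + stokens + ["[SEP]"]

input_ids = tokenizer.convert_tokens_to_ids(stokens)
input_ids = input_ids + [0] * (max_seq_length - len(input_ids))          # pad token ids
input_mask = [1] * len(stokens) + [0] * (max_seq_length - len(stokens))  # 1 = real token
segment_ids = [0] * max_seq_length                                       # single-sentence input

pooled_output, sequence_output = model.predict(
    [np.array([input_ids]), np.array([input_mask]), np.array([segment_ids])])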