Example #1
    def run(self):
        # set environment
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(self._gpuid)

        # load models
        # each worker only needs to load the model once
        paths = get_checkpoint_paths(self._bert_checkpoint)
        model = load_trained_model_from_checkpoint(
            config_file=paths.config,
            checkpoint_file=paths.checkpoint,
            output_layer_num=1,
        )
        vocabs = load_vocabulary(paths.vocab)
        print('model init done', self._gpuid)

        while True:
            xfile = self._queue.get()
            if xfile is None:
                self._queue.put(None)
                break
            embeddings = extract_embeddings(model=model,
                                            vocabs=vocabs,
                                            texts=xfile[1],
                                            output_layer_num=1,
                                            poolings=[POOL_NSP, POOL_MAX])
            print('worker running', self._gpuid, len(self.return_list))
            self.return_list.append({
                'worker': self._gpuid,
                'id': xfile[0],
                'content': xfile[1],
                'embeddings': embeddings
            })

        print('worker predict done at gpu:', self._gpuid)
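
The snippet above only shows the worker's run loop; how the workers are launched and how the queue is fed is not part of the example. A minimal driver sketch, assuming a multiprocessing.Process subclass named Worker whose constructor takes the GPU id, the shared queue, the shared return list, and the checkpoint path (all of these are assumptions, not shown above):

import multiprocessing as mp

def run_workers(items, bert_checkpoint, gpu_ids):
    # items: iterable of (id, texts) pairs, matching xfile[0] / xfile[1] above
    manager = mp.Manager()
    queue = manager.Queue()
    return_list = manager.list()

    # one worker per GPU; each loads the model exactly once inside run()
    workers = [Worker(gpu_id, queue, return_list, bert_checkpoint)
               for gpu_id in gpu_ids]
    for w in workers:
        w.start()

    for item in items:
        queue.put(item)
    # a single None sentinel: each worker re-queues it before exiting,
    # so every worker eventually sees it and stops
    queue.put(None)

    for w in workers:
        w.join()
    return list(return_list)
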
def download_pretrained_bert(language_backbone='chinese_wwm_base'):
    base_model_path = {
        'multi_cased_base': PretrainedList.multi_cased_base,
        'chinese_base': PretrainedList.chinese_base,
        'wwm_uncased_large': PretrainedList.wwm_uncased_large,
        'wwm_cased_large': PretrainedList.wwm_cased_large,
        'chinese_wwm_base': 'https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip',
        'bert_base_cased': 'https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip',
        'bert_large_cased': 'https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip',
        'bert_base_uncased': 'https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip',
        'bert_large_uncased': 'https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip'
    }
    model_path = get_pretrained(base_model_path[language_backbone.lower()])
    paths = get_checkpoint_paths(model_path)
    print(paths)
    return paths
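
The returned paths can be passed straight to keras-bert's loader. An illustrative follow-up, where the pooling and layer choices merely mirror Example #1 and are not part of this snippet:

from keras_bert import (POOL_MAX, POOL_NSP, extract_embeddings,
                        load_trained_model_from_checkpoint, load_vocabulary)

paths = download_pretrained_bert('chinese_wwm_base')
model = load_trained_model_from_checkpoint(
    config_file=paths.config,
    checkpoint_file=paths.checkpoint,
    output_layer_num=1,
)
vocabs = load_vocabulary(paths.vocab)
# pooled sentence embeddings, as in Example #1
embeddings = extract_embeddings(model=model,
                                vocabs=vocabs,
                                texts=['语言模型'],
                                output_layer_num=1,
                                poolings=[POOL_NSP, POOL_MAX])
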
Example #3
 def load_model(self):
     tf.keras.backend.clear_session()
     logging.info("Loading RuBERT model...")
     paths = get_checkpoint_paths("model_bert")
     inputs = load_trained_model_from_checkpoint(
         config_file=paths.config,
         checkpoint_file=paths.checkpoint, seq_len=50)
     outputs = MaskedGlobalMaxPool1D(name="Pooling")(inputs.output)
     vocab = load_vocabulary(paths.vocab)
     return tf.keras.Model(inputs=inputs.inputs,
                           outputs=outputs), vocab, Tokenizer(vocab)
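
A possible way to use the returned triple; the instance name obj and the Russian sample sentence are placeholders, not part of the example:

import numpy as np

# obj stands for an instance of the class that defines load_model (assumed)
model, vocab, tokenizer = obj.load_model()

indices, segments = tokenizer.encode('пример предложения', max_len=50)
pooled = model.predict([np.array([indices]), np.array([segments])])
print(pooled.shape)  # (1, hidden_size), e.g. (1, 768) for a base model
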
Example #4
 def __init__(self, docs, vec):
     self.texts = np.array(docs)
     self.vec = vec
     paths = get_checkpoint_paths(".")
     inputs = load_trained_model_from_checkpoint(
         config_file=paths.config,
         checkpoint_file=paths.checkpoint,
         seq_len=50)
     outputs = MaskedGlobalMaxPool1D(name='Pooling')(inputs.output)
     self.model = Model(inputs=inputs.inputs, outputs=outputs)
     self.vocab = load_vocabulary(paths.vocab)
     self.tokenizer = Tokenizer(self.vocab)
Example #5
    def __init__(self, config):
        model_path = config["model_path"]
        if not os.path.exists(model_path):
            model_dir = os.path.dirname(model_path)
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)
            subprocess.run(
                f"wget -P {model_dir} {MODEL_URL} && cd {model_dir} && unzip chinese_wwm_L-12_H-768_A-12.zip",
                shell=True)

        paths = get_checkpoint_paths(model_path)
        self.model = load_trained_model_from_checkpoint(
            config_file=paths.config,
            checkpoint_file=paths.checkpoint,
            output_layer_num=1)
        self.vocabs = load_vocabulary(paths.vocab)
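
How the loaded model and vocabulary are used afterwards is not shown here; presumably via extract_embeddings, as in Example #1. A hypothetical call, where encoder stands for an instance of the class above:

from keras_bert import extract_embeddings

embeddings = extract_embeddings(model=encoder.model,
                                vocabs=encoder.vocabs,
                                texts=['今天天气不错', '今天天气很好'],
                                output_layer_num=1)
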
Example #6
def load():
    global test_tables
    test_table_file = '../data/val.tables.json'
    bert_model_path = '../model'
    test_tables = read_tables(test_table_file)
    paths = get_checkpoint_paths(bert_model_path)
    global label_encoder
    label_encoder = SqlLabelEncoder()
    global query_tokenizer
    model, query_tokenizer = construct_model(paths)
    model_path = '../task1_best_model.h5'
    model.load_weights(model_path)
    global tokenizer
    model2, tokenizer = construct_model2(paths)
    model2.load_weights('../model_best_weights.h5')
    global models
    models = {}
    models['stage1'] = model
    models['stage2'] = model2
    global graph
    graph = tf.get_default_graph()
Example #7
# In[ ]:

train_table_file = '../data/train/train.tables.json'
train_data_file = '../data/train/train.json'

val_table_file = '../data/val/val.tables.json'
val_data_file = '../data/val/val.json'

test_table_file = '../data/test/test.tables.json'
test_data_file = '../data/test/test.json'

# Download pretrained BERT model from https://github.com/ymcui/Chinese-BERT-wwm
bert_model_path = '../model/chinese_wwm_L-12_H-768_A-12'

paths = get_checkpoint_paths(bert_model_path)

task1_file = '../submit/task1_output.json'

# ## Read Data

# In[ ]:

train_tables = read_tables(train_table_file)
train_data = read_data(train_data_file, train_tables)

val_tables = read_tables(val_table_file)
val_data = read_data(val_data_file, val_tables)

test_tables = read_tables(test_table_file)
test_data = read_data(test_data_file, test_tables)
Example #8
import sys
import numpy as np
from keras_bert import load_vocabulary, load_trained_model_from_checkpoint, Tokenizer, get_checkpoint_paths

print(
    'This demo demonstrates how to load the pre-trained model and extract word embeddings'
)

if len(sys.argv) == 2:
    model_path = sys.argv[1]
else:
    from keras_bert.datasets import get_pretrained, PretrainedList
    model_path = get_pretrained(PretrainedList.chinese_base)

paths = get_checkpoint_paths(model_path)

model = load_trained_model_from_checkpoint(paths.config,
                                           paths.checkpoint,
                                           seq_len=10)
model.summary(line_length=120)

token_dict = load_vocabulary(paths.vocab)

tokenizer = Tokenizer(token_dict)
text = '语言模型'
tokens = tokenizer.tokenize(text)
print('Tokens:', tokens)
indices, segments = tokenizer.encode(first=text, max_len=10)

predicts = model.predict([np.array([indices]), np.array([segments])])[0]
for i, token in enumerate(tokens):
    print(token, predicts[i].tolist()[:5])
Example #9
#!/usr/bin/python3
# import os
# os.environ['TF_KERAS'] = '1'
# import tensorflow as tf
import keras
from keras import backend as K
from keras_bert import load_vocabulary, Tokenizer, get_checkpoint_paths, load_model_weights_from_checkpoint
from keras_bert.layers import TokenEmbedding, PositionEmbedding
import json
from data_generator import load_data, convert_to_sample, DataGenerator
import numpy as np
from tqdm import tqdm

pretrained_path = "/Users/weisu.yxd/Code/bert/chinese_L-12_H-768_A-12"
# pretrained_path = "chinese_L-12_H-768_A-12"
paths = get_checkpoint_paths(pretrained_path)
token_dict = load_vocabulary(paths.vocab)
mask_id = token_dict.get("[MASK]")

tokenizer = Tokenizer(token_dict)
id2token = {j: i for i, j in token_dict.items()}
char_start_index = 670
char_end_index = 7991


def get_model_from_embedding(inputs,
                             embed_layer,
                             transformer_num=12,
                             head_num=12,
                             feed_forward_dim=3072,
                             dropout_rate=0.1,
Example #10
from nl2sql_model.utils import read_data, read_tables, SQL, MultiSentenceTokenizer, Query, Question, Table
from nl2sql_model.utils.optimizer import RAdam

# ---------------------------------Load data---------------------------------
train_table_file = 'G:\\datas\\nl2sql\\TableQA-master\\train\\train.tables.json'
train_data_file = 'G:\\datas\\nl2sql\\TableQA-master\\train\\train.json'

val_table_file = 'G:\\datas\\nl2sql\\TableQA-master\\val\\val.tables.json'
val_data_file = 'G:\\datas\\nl2sql\\TableQA-master\\val\\val.json'

test_table_file = 'G:\\datas\\nl2sql\\TableQA-master\\test\\test.tables.json'
test_data_file = 'G:\\datas\\nl2sql\\TableQA-master\\test\\test.json'

# Download pretrained BERT model from https://github.com/ymcui/Chinese-BERT-wwm
bert_model_path = 'G:\\datas\\nl2sql\\chinese_wwm_L-12_H-768_A-12'
paths = get_checkpoint_paths(bert_model_path)  # obtains the paths of the saved checkpoint files

# ---------------------------------Data preprocessing---------------------------------
train_tables = read_tables(train_table_file)

# table join happens here
train_data = read_data(train_data_file, train_tables)  # links question/sql/table together

val_tables = read_tables(val_table_file)
val_data = read_data(val_data_file, val_tables)

test_tables = read_tables(test_table_file)
test_data = read_data(test_data_file, test_tables)

sample_query = train_data[2]
Example #11
    
    output = keras.layers.Conv1D(32, 2, activation='tanh')(output)
    output = keras.layers.AveragePooling1D(2, strides=1)(output)
    output = keras.layers.Conv1D(64, 3, activation='tanh')(output)
    output = keras.layers.AveragePooling1D(2, strides=1)(output)
    output = keras.layers.Conv1D(64, 4, activation='tanh')(output)
    output = keras.layers.AveragePooling1D(4, strides=1)(output)

    output = keras.layers.Flatten()(output)
    output_y = keras.layers.Dense(count, activation='softmax')(output)  # new softmax layer
    model = keras.Model(base_model.input, output_y)
    # summarize the model
    model.summary()
    return model

checkpoint_paths = keras_bert.get_checkpoint_paths('./chinese_L-12_H-768_A-12')
token_dict = keras_bert.loader.load_vocabulary(checkpoint_paths.vocab)
tokenizer = keras_bert.tokenizer.Tokenizer(token_dict)


# define documents
max_labels = 0
x_tokens = []
x_segments = []
y = []
labels = []
with open('./datas/questions.json') as fp:
    loaded_json = json.load(fp)    
    for doc in loaded_json:        
        labels.append(doc['label'])        
        for q in doc['questions']:    
Example #12
 def load_bert_model(self, model_path):
     paths = get_checkpoint_paths(model_path)
     self.bert_model = load_trained_model_from_checkpoint(paths.config, paths.checkpoint,
                                                          training=False, seq_len=self.max_seq_len)
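
A sketch of pushing tokens through the loaded backbone; the checkpoint directory, sequence length, and sample text are assumptions, and the tokenisation mirrors the other examples on this page:

import numpy as np
from keras_bert import (Tokenizer, get_checkpoint_paths,
                        load_trained_model_from_checkpoint, load_vocabulary)

model_path = 'chinese_L-12_H-768_A-12'   # assumed checkpoint directory
max_seq_len = 128                        # assumed value of self.max_seq_len

paths = get_checkpoint_paths(model_path)
bert_model = load_trained_model_from_checkpoint(paths.config, paths.checkpoint,
                                                training=False, seq_len=max_seq_len)
tokenizer = Tokenizer(load_vocabulary(paths.vocab))

indices, segments = tokenizer.encode('语言模型', max_len=max_seq_len)
# per-token hidden states: shape (1, max_seq_len, hidden_size)
hidden = bert_model.predict([np.array([indices]), np.array([segments])])
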
Example #13
inputs = np.load('../data/coco_korean/coco_korean_tokens.npy')
# inputs = tf.convert_to_tensor(tokens, dtype=tf.int32)
print(inputs.shape)
print(inputs[:2])

segments = np.ones_like(inputs)

# model = BertModel(config, False, inputs)

ckpt = '../bert_eojeol/'
# model = tf.keras.Model()
# checkpoint = tf.train.Checkpoint(model=model)
# checkpoint.restore(tf.train.latest_checkpoint(ckpt))
# print(model.layers)

paths = get_checkpoint_paths(ckpt)
model = load_trained_model_from_checkpoint(paths.config,
                                           paths.checkpoint,
                                           training=False,
                                           seq_len=142)
# model.summary()
# model.save('koreanbert.h5')

dataset = tf.data.Dataset.from_tensor_slices((inputs, segments))
dataset = dataset.batch(5000)

outputs = np.empty((inputs.shape[0], 768))
i = 0
for data in dataset:
    inp = data[0]
    seg = data[1]