def build(self):
    self.embedding_type = 'bert'
    config_path = os.path.join(self.corpus_path, 'bert_config.json')
    check_point_path = os.path.join(self.corpus_path, 'bert_model.ckpt')
    dict_path = os.path.join(self.corpus_path, 'vocab.txt')
    model = keras_bert.load_trained_model_from_checkpoint(config_path,
                                                          check_point_path,
                                                          seq_len=self.len_max)
    # Take the outputs of the last four transformer blocks (8 Keras layers apart)
    num_layers = len(model.layers)
    features_layers = [model.get_layer(index=num_layers - 1 + idx * 8).output
                       for idx in range(-3, 1)]
    embedding_layer = Concatenate()(features_layers)  # Concatenate is a layer: instantiate, then call
    output_layer = NonMaskingLayer()(embedding_layer)
    self.model = Model(model.inputs, output_layer)
    self.embedding_size = self.model.output_shape[-1]

    # Map every vocab entry to its line number, then alias the special tokens
    word2idx = {}
    with open(dict_path, 'r', encoding='utf-8') as f:
        words = f.read().splitlines()
    for idx, word in enumerate(words):
        word2idx[word] = idx
    for key, value in self.ot_dict.items():
        word2idx[key] = word2idx[value]
    self.token2idx = word2idx

    # reader tokenizer
    self.token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            self.token_dict[token] = len(self.token_dict)
    self.tokenizer = keras_bert.Tokenizer(self.token_dict)

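# Usage sketch for the embedder built above. `embedder`, its `len_max`, and the
# sample sentence are hypothetical stand-ins, not from the snippet. The two
# arrays returned by keras_bert's tokenizer feed the model's two inputs.
import numpy as np

indices, segments = embedder.tokenizer.encode(first='今天天气不错', max_len=embedder.len_max)
features = embedder.model.predict([np.array([indices]), np.array([segments])])
print(features.shape)  # (1, len_max, 4 * 768) for a base model: four block outputs concatenated
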
def _build_token2idx_from_bert(self):
    dict_path = os.path.join(self.model_folder, 'vocab.txt')
    if not os.path.exists(dict_path):
        # vocab.txt is missing locally: download and extract a pre-trained model
        model_name = self.model_key_map.get(self.model_folder, 'chinese_L-12_H-768_A-12')
        url = self.pre_trained_models.get(model_name)
        get_file(model_name + ".zip", url,
                 extract=True,
                 cache_dir=text2vec.USER_DATA_DIR,
                 cache_subdir=text2vec.USER_DATA_DIR,
                 verbose=1)
        self.model_folder = os.path.join(text2vec.USER_DATA_DIR, model_name)
        dict_path = os.path.join(self.model_folder, 'vocab.txt')
    logger.debug(f'load vocab.txt from {dict_path}')

    token2idx = {}
    with codecs.open(dict_path, 'r', encoding='utf-8') as f:
        for line in f:
            token = line.strip()
            token2idx[token] = len(token2idx)

    self.bert_token2idx = token2idx
    self.tokenizer = keras_bert.Tokenizer(token2idx)
    self.processor.token2idx = self.bert_token2idx
    self.processor.idx2token = {idx: token for token, idx in token2idx.items()}

def build(self):
    from keras_textclassification.keras_layers.albert.albert import load_brightmart_albert_zh_checkpoint
    import keras_bert

    self.embedding_type = 'albert'
    dict_path = os.path.join(self.corpus_path, 'vocab.txt')
    print('load albert model start!')
    # Sanity-check the requested layer indexes; anything out of range falls back to -1
    self.layer_indexes = [i if i in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -2] else -1
                          for i in self.layer_indexes]
    self.model = load_brightmart_albert_zh_checkpoint(self.corpus_path,
                                                      training=self.trainable,
                                                      seq_len=self.len_max,
                                                      output_layers=self.layer_indexes)
    self.input = self.model.inputs
    self.output = self.model.outputs[0]
    print('load albert model end!')

    # Graph indexes of all usable ALBERT layers
    layer_dict = [8, 13]
    layer_0 = 13
    for i in range(10):
        layer_0 = layer_0 + 2
        layer_dict.append(layer_0)
    layer_dict.append(36)
    print(layer_dict)

    # No indexes given: use the model output as-is
    if len(self.layer_indexes) == 0:
        encoder_layer = self.model.output
    # A single index: take that layer's output; fall back to the last layer if invalid
    elif len(self.layer_indexes) == 1:
        if self.layer_indexes[0] in [i + 1 for i in range(13)]:
            encoder_layer = self.model.get_layer(index=layer_dict[self.layer_indexes[0] - 1]).output
        else:
            encoder_layer = self.model.get_layer(index=layer_dict[-1]).output
    # Otherwise take every requested layer's output (valid indexes are 1..13)
    # and sum them with Add(); the output stays 768-dim
    else:
        all_layers = [self.model.get_layer(index=layer_dict[lay - 1]).output
                      if lay in [i + 1 for i in range(13)]
                      else self.model.get_layer(index=layer_dict[-1]).output  # invalid index: last layer
                      for lay in self.layer_indexes]
        encoder_layer = Add()(all_layers)

    self.output = NonMaskingLayer()(encoder_layer)
    self.input = self.model.inputs
    self.model = Model(self.input, self.output)

    # reader tokenizer
    self.token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            self.token_dict[token] = len(self.token_dict)
    self.vocab_size = len(self.token_dict)
    self.tokenizer = keras_bert.Tokenizer(self.token_dict)

def build(self):
    import keras_bert

    self.embedding_type = 'bert'
    config_path = os.path.join(self.corpus_path, 'bert_config.json')
    check_point_path = os.path.join(self.corpus_path, 'bert_model.ckpt')
    dict_path = os.path.join(self.corpus_path, 'vocab.txt')
    # logger.info('load bert model start!')
    model = keras_bert.load_trained_model_from_checkpoint(config_path,
                                                          check_point_path,
                                                          seq_len=self.len_max,
                                                          trainable=self.trainable)
    # logger.info('load bert model success!')

    # Graph indexes of the embedding output and the 12 transformer block outputs
    layer_dict = [6]
    layer_0 = 7
    for i in range(12):
        layer_0 = layer_0 + 8
        layer_dict.append(layer_0)
    logger.info(layer_dict)

    # No indexes given: use the model output as-is
    if len(self.layer_indexes) == 0:
        encoder_layer = model.output
    # A single index: take that layer's output; fall back to the last layer if invalid
    elif len(self.layer_indexes) == 1:
        if self.layer_indexes[0] in [i + 1 for i in range(13)]:
            encoder_layer = model.get_layer(index=layer_dict[self.layer_indexes[0] - 1]).output
        else:
            encoder_layer = model.get_layer(index=layer_dict[-1]).output
    # Otherwise take every requested layer's output (valid indexes are 1..13) and sum them
    else:
        all_layers = [model.get_layer(index=layer_dict[lay - 1]).output
                      if lay in [i + 1 for i in range(13)]
                      else model.get_layer(index=layer_dict[-1]).output  # invalid index: last layer
                      for lay in self.layer_indexes]
        encoder_layer = tf.keras.layers.Add(name="layer_add_bert")(all_layers)

    self.output = NonMaskingLayer(name="layer_non_masking_layer")(encoder_layer)
    self.input = model.inputs
    self.model = tf.keras.Model(self.input, self.output)
    self.embedding_size = self.model.output_shape[-1]

    # reader tokenizer
    self.token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            self.token_dict[token] = len(self.token_dict)
    self.vocab_size = len(self.token_dict)
    self.tokenizer = keras_bert.Tokenizer(self.token_dict)

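# A compact equivalent of the index table built above. The stride of 8 is an
# assumption read off the snippet's own arithmetic (each transformer block in a
# keras_bert-loaded graph appears to contribute 8 Keras layers), not a
# documented keras_bert guarantee.
layer_dict = [6] + [7 + 8 * (i + 1) for i in range(12)]
print(layer_dict)  # [6, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103]
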
def build(self):
    from src.keras_layers.albert.albert import load_brightmart_albert_zh_checkpoint
    import keras_bert

    self.embedding_type = 'albert'
    dict_path = os.path.join(self.corpus_path, 'vocab.txt')
    print("Load Albert Model Start!")
    self.model = load_brightmart_albert_zh_checkpoint(self.corpus_path,
                                                      training=self.trainable,
                                                      seq_len=self.len_max,
                                                      output_layers=None)
    # Read num_hidden_layers from the shipped config file
    config = {}
    for file_name in os.listdir(self.corpus_path):
        if file_name.startswith('bert_config.json'):
            with open(os.path.join(self.corpus_path, file_name)) as reader:
                config = json.load(reader)
            break
    num_hidden_layers = config.get('num_hidden_layers', 0)
    layer_real = [i for i in range(num_hidden_layers)] + [-i for i in range(num_hidden_layers)]
    # Sanity-check the requested layer indexes; anything out of range falls back to -2
    self.layer_indexes = [i if i in layer_real else -2 for i in self.layer_indexes]
    print("Load Albert Model End!")

    # Graph indexes of all usable ALBERT layers
    layer_dict = [4, 8, 11, 13]
    layer_0 = 13
    for i in range(num_hidden_layers):
        layer_0 = layer_0 + 1
        layer_dict.append(layer_0)
    print(layer_dict)

    # No indexes given: use the model output as-is
    if len(self.layer_indexes) == 0:
        encoder_layer = self.model.output
    # Otherwise take every requested layer's output (invalid indexes default to
    # the second-to-last entry) and sum them; a single tensor is used directly,
    # since Add() needs at least two inputs
    else:
        all_layers = [self.model.get_layer(index=layer_dict[lay]).output
                      if lay in layer_real
                      else self.model.get_layer(index=layer_dict[-2]).output
                      for lay in self.layer_indexes]
        encoder_layer = all_layers[0] if len(all_layers) == 1 else Add()(all_layers)

    self.output = NonMaskingLayer()(encoder_layer)
    self.input = self.model.inputs
    self.model = Model(self.input, self.output)

    self.token_dict = {}
    with codecs.open(dict_path, 'r', encoding='utf-8') as reader:
        for line in reader:
            token = line.strip()
            self.token_dict[token] = len(self.token_dict)
    self.vocab_size = len(self.token_dict)
    self.tokenizer = keras_bert.Tokenizer(self.token_dict)

def build(self):
    from macropodus.network.layers.albert import load_brightmart_albert_zh_checkpoint
    import keras_bert

    self.embedding_type = 'albert'
    dict_path = os.path.join(self.corpus_path, 'vocab.txt')
    # logger.info('load albert model start!')
    layer_real = [i for i in range(25)] + [-i for i in range(25)]
    # Sanity-check the requested layer indexes; anything out of range falls back to -2
    self.layer_indexes = [i if i in layer_real else -2 for i in self.layer_indexes]
    self.model = load_brightmart_albert_zh_checkpoint(self.corpus_path,
                                                      training=self.trainable,
                                                      seq_len=self.len_max,
                                                      output_layers=None)
    # logger.info('load albert model success!')

    # Graph indexes of all usable ALBERT layers
    layer_dict = [4, 8, 11, 13]
    layer_0 = 13
    for i in range(20):
        layer_0 = layer_0 + 1
        layer_dict.append(layer_0)
    layer_dict.append(34)
    logger.info(layer_dict)

    # No indexes given: use the model output as-is
    if len(self.layer_indexes) == 0:
        encoder_layer = self.model.output
    # A single index: take that layer's output; fall back to the second-to-last if invalid
    elif len(self.layer_indexes) == 1:
        if self.layer_indexes[0] in layer_real:
            encoder_layer = self.model.get_layer(index=layer_dict[self.layer_indexes[0]]).output
        else:
            encoder_layer = self.model.get_layer(index=layer_dict[-2]).output
    # Otherwise take every requested layer's output and sum them
    else:
        all_layers = [self.model.get_layer(index=layer_dict[lay]).output
                      if lay in layer_real
                      else self.model.get_layer(index=layer_dict[-2]).output  # invalid index: default
                      for lay in self.layer_indexes]
        encoder_layer = tf.keras.layers.Add()(all_layers)

    output = NonMaskingLayer()(encoder_layer)
    self.output = [output]
    self.input = self.model.inputs
    self.model = tf.keras.Model(self.input, self.output)

    # reader tokenizer
    self.token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            self.token_dict[token] = len(self.token_dict)
    self.vocab_size = len(self.token_dict)
    self.tokenizer = keras_bert.Tokenizer(self.token_dict)

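# Compact equivalent of the ALBERT index table above. Its +1 stride (vs. the +8
# stride in the BERT tables) plausibly reflects ALBERT's cross-layer weight
# sharing, under which each extra block adds only about one layer to the Keras
# graph; that reading is an inference from these snippets, not a documented fact.
layer_dict = [4, 8, 11, 13] + [14 + i for i in range(20)] + [34]
print(layer_dict)  # [4, 8, 11, 13, 14, 15, ..., 33, 34]
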
def _build_token2idx_from_bert(self):
    token2idx = {}
    with codecs.open(self.vocab_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token2idx[token] = len(token2idx)

    self.bert_token2idx = token2idx
    self._tokenizer = keras_bert.Tokenizer(token2idx)
    self.processor.token2idx = self.bert_token2idx
    self.processor.idx2token = {idx: token for token, idx in token2idx.items()}

def build(self):
    import keras_bert

    self.embedding_type = 'bert'
    config_path = os.path.join(self.corpus_path, 'bert_config.json')
    check_point_path = os.path.join(self.corpus_path, 'bert_model.ckpt')
    dict_path = os.path.join(self.corpus_path, 'vocab.txt')
    print("Load Bert Model Start!")
    model = keras_bert.load_trained_model_from_checkpoint(config_path,
                                                          check_point_path,
                                                          seq_len=self.len_max,
                                                          trainable=self.trainable)
    print("Load Bert Model End!")

    # Graph indexes of all usable BERT layers
    layer_dict = [6]
    layer_0 = 7
    for i in range(12):
        layer_0 = layer_0 + 8
        layer_dict.append(layer_0)
    print(layer_dict)

    # No indexes given: use the model output as-is
    if len(self.layer_indexes) == 0:
        encoder_layer = model.output
    # A single index: take that layer's output; fall back to the last layer if invalid
    elif len(self.layer_indexes) == 1:
        if self.layer_indexes[0] in [i + 1 for i in range(13)]:
            encoder_layer = model.get_layer(index=layer_dict[self.layer_indexes[0] - 1]).output
        else:
            encoder_layer = model.get_layer(index=layer_dict[-1]).output
    # Otherwise take every requested layer's output and sum them
    else:
        all_layers = [model.get_layer(index=layer_dict[lay - 1]).output
                      if lay in [i + 1 for i in range(13)]
                      else model.get_layer(index=layer_dict[-1]).output
                      for lay in self.layer_indexes]
        encoder_layer = Add()(all_layers)

    self.output = NonMaskingLayer()(encoder_layer)
    self.input = model.inputs
    self.model = Model(self.input, self.output)
    self.embedding_size = self.model.output_shape[-1]

    self.token_dict = {}
    with codecs.open(dict_path, 'r', encoding='utf-8') as reader:
        for line in reader:
            token = line.strip()
            self.token_dict[token] = len(self.token_dict)
    self.vocab_size = len(self.token_dict)
    self.tokenizer = keras_bert.Tokenizer(self.token_dict)

def _build_token2idx_from_bert(self):
    dict_path = os.path.join(self.model_folder, 'vocab.txt')
    token2idx = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token2idx[token] = len(token2idx)

    self.bert_token2idx = token2idx
    # Wrap the dict in a tokenizer, which provides encode and decode methods
    self._tokenizer = keras_bert.Tokenizer(token2idx)
    self.processor.token2idx = self.bert_token2idx
    self.processor.idx2token = {idx: token for token, idx in token2idx.items()}

def get_tokenizer() -> keras_bert.Tokenizer:
    '''
    Returns the Bert tokenizer.
    '''
    cache_f = Path('datasets/bert_tokenizer.bin')
    if cache_f.exists():
        print('Found cached tokenizer.')
        with cache_f.open('r+b') as f:
            tokenizer = pickle.load(f)
        return tokenizer

    print('Building tokenizer.')
    vocab_path = get_bert_model_dir() / 'vocab.txt'
    token_dict = {}
    with codecs.open(str(vocab_path), 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)

    at_unused = 0

    def next_unused() -> str:
        nonlocal at_unused
        ret = f'[unused{at_unused}]'
        at_unused += 1
        return ret

    def add_token_to_vocab(token: str):
        # Reuse one of BERT's reserved [unusedN] vocab slots for the new token
        to_replace = next_unused()
        assert to_replace in token_dict
        token_dict[token] = token_dict[to_replace]
        del token_dict[to_replace]

    add_token_to_vocab('<user>')
    add_token_to_vocab('<url>')
    add_token_to_vocab('<num>')

    tokenizer = keras_bert.Tokenizer(token_dict)
    with cache_f.open('w+b') as f:
        pickle.dump(tokenizer, f)
    return tokenizer

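# Toy illustration of the [unused] slot trick above, with made-up ids: the new
# token simply inherits a reserved slot's id, so the checkpoint's embedding
# matrix needs no resizing (that row just gets fine-tuned for the new token).
token_dict = {'[unused0]': 1, '[unused1]': 2}
token_dict['<user>'] = token_dict.pop('[unused0]')
print(token_dict)  # {'[unused1]': 2, '<user>': 1}
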
def build(self):
    from keras_textclassification.keras_layers.albert.albert import load_brightmart_albert_zh_checkpoint
    import keras_bert
    import json

    self.embedding_type = 'albert'
    dict_path = os.path.join(self.corpus_path, 'vocab.txt')
    print('load albert model start!')
    self.model = load_brightmart_albert_zh_checkpoint(self.corpus_path,
                                                      training=self.trainable,
                                                      seq_len=self.len_max,
                                                      output_layers=None)
    # Read num_hidden_layers from the shipped config file
    config = {}
    for file_name in os.listdir(self.corpus_path):
        if file_name.startswith('albert_config_base.json'):
            with open(os.path.join(self.corpus_path, file_name)) as reader:
                config = json.load(reader)
            break
    num_hidden_layers = config.get("num_hidden_layers", 0)
    layer_real = [i for i in range(num_hidden_layers)] + [-i for i in range(num_hidden_layers)]
    # Sanity-check the requested layer indexes; anything out of range falls back to -2
    self.layer_indexes = [i if i in layer_real else -2 for i in self.layer_indexes]
    print('load albert model end!')

    # Graph indexes of all usable ALBERT layers
    layer_dict = [4, 8, 11, 13]
    layer_0 = 13
    for i in range(num_hidden_layers):
        layer_0 = layer_0 + 1
        layer_dict.append(layer_0)
    print(layer_dict)

    # No indexes given: use the model output as-is
    if len(self.layer_indexes) == 0:
        encoder_layer = self.model.output
    # A single index: take that layer's output; fall back to the second-to-last if invalid
    elif len(self.layer_indexes) == 1:
        if self.layer_indexes[0] in layer_real:
            encoder_layer = self.model.get_layer(index=layer_dict[self.layer_indexes[0]]).output
        else:
            encoder_layer = self.model.get_layer(index=layer_dict[-2]).output
    # Otherwise take every requested layer's output and sum them
    else:
        all_layers = [self.model.get_layer(index=layer_dict[lay]).output
                      if lay in layer_real
                      else self.model.get_layer(index=layer_dict[-2]).output  # invalid index: default
                      for lay in self.layer_indexes]
        encoder_layer = Add()(all_layers)

    self.output = NonMaskingLayer()(encoder_layer)
    self.input = self.model.inputs
    self.model = Model(self.input, self.output)

    # reader tokenizer
    self.token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            self.token_dict[token] = len(self.token_dict)
    self.vocab_size = len(self.token_dict)
    self.tokenizer = keras_bert.Tokenizer(self.token_dict)

def build(self):
    import keras_bert

    self.embedding_type = 'bert'
    config_path = os.path.join(self.corpus_path, 'bert_config.json')
    check_point_path = os.path.join(self.corpus_path, 'bert_model.ckpt')
    dict_path = os.path.join(self.corpus_path, 'vocab.txt')
    print('load bert model start!')
    model = keras_bert.load_trained_model_from_checkpoint(config_path,
                                                          check_point_path,
                                                          seq_len=self.len_max,
                                                          trainable=self.trainable)
    print('load bert model end!')

    # Graph indexes of all usable BERT layers
    layer_dict = [6]
    layer_0 = 7
    for i in range(12):
        layer_0 = layer_0 + 8
        layer_dict.append(layer_0)
    print(layer_dict)

    # No indexes given: use the model output as-is
    if len(self.layer_indexes) == 0:
        encoder_layer = model.output
    # A single index: take that layer's output; fall back to the last layer if invalid
    elif len(self.layer_indexes) == 1:
        if self.layer_indexes[0] in [i + 1 for i in range(13)]:
            encoder_layer = model.get_layer(index=layer_dict[self.layer_indexes[0] - 1]).output
        else:
            encoder_layer = model.get_layer(index=layer_dict[-1]).output
    # Otherwise take every requested layer's output (valid indexes are 1..13) and sum them
    else:
        all_layers = [model.get_layer(index=layer_dict[lay - 1]).output
                      if lay in [i + 1 for i in range(13)]
                      else model.get_layer(index=layer_dict[-1]).output  # invalid index: last layer
                      for lay in self.layer_indexes]
        encoder_layer = Add()(all_layers)

    self.output = NonMaskingLayer()(encoder_layer)
    self.input = model.inputs
    self.model = Model(self.input, self.output)
    self.embedding_size = self.model.output_shape[-1]

    # reader tokenizer
    self.token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            self.token_dict[token] = len(self.token_dict)
    self.vocab_size = len(self.token_dict)
    self.tokenizer = keras_bert.Tokenizer(self.token_dict)

import keras_bert

token_dict = {
    '[CLS]': 0,
    '[SEP]': 1,
    'un': 2,
    '##aff': 3,
    '##able': 4,
    '[UNK]': 5,
}
tokenizer = keras_bert.Tokenizer(token_dict)

print(tokenizer.tokenize('unaffable'))
# The result should be `['[CLS]', 'un', '##aff', '##able', '[SEP]']`

indices, segments = tokenizer.encode('unaffable')
print(indices)   # Should be `[0, 2, 3, 4, 1]`
print(segments)  # Should be `[0, 0, 0, 0, 0]`

print(tokenizer.tokenize(first='unaffable', second='钢'))
# The result should be `['[CLS]', 'un', '##aff', '##able', '[SEP]', '钢', '[SEP]']`

indices, segments = tokenizer.encode(first='unaffable', second='钢', max_len=10)
print(indices)   # Should be `[0, 2, 3, 4, 1, 5, 1, 0, 0, 0]`
print(segments)  # Should be `[0, 0, 0, 0, 0, 1, 1, 0, 0, 0]`

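# Indices map back through a plain inverted dict. In this toy vocabulary id 0
# belongs to '[CLS]', so the zero-padding added by max_len decodes as '[CLS]';
# a real BERT vocab.txt reserves id 0 for '[PAD]' instead.
idx2token = {idx: token for token, idx in token_dict.items()}
print([idx2token[i] for i in indices])
# ['[CLS]', 'un', '##aff', '##able', '[SEP]', '[UNK]', '[SEP]', '[CLS]', '[CLS]', '[CLS]']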