def clean_sub_word_sentence(word_ids: np.ndarray, bpemb: "BPEmb"):
    """Decode a sequence of sub-word ids into text, stopping at the EOS token.

    :param word_ids: 1-D array of sub-word ids, offset by +1 because an extra
        padding token was prepended to the BPEmb vocabulary.
    :param bpemb: BPEmb encoder (provides ``EOS`` and ``decode_ids``).
    :return: the decoded sentence.

    Fixed: the annotation used ``np.array`` (a factory function) instead of
    the actual type ``np.ndarray``.
    """
    # Undo the +1 shift introduced by the extra padding token in BPEmb.
    word_ids = word_ids - 1
    try:
        # Decode only the ids before the first EOS token.
        index = list(word_ids).index(bpemb.EOS)
        words = bpemb.decode_ids(word_ids[:index])
    except ValueError:
        # No EOS found in the sequence: decode everything.
        words = bpemb.decode_ids(word_ids)
    return words
class SubWordVocab(object):
    """Sub-word vocabulary backed by a pretrained English BPEmb model."""

    def __init__(self, size):
        self.encoder = BPEmb(lang='en', vs=size)
        # BPEmb convention: BOS id is 1 and EOS id is 2; fail fast otherwise.
        assert self.sos_id == 1
        assert self.eos_id == 2

    @property
    def sos_id(self):
        # Start-of-sequence id (fixed by the BPEmb vocabulary layout).
        return 1

    @property
    def eos_id(self):
        # End-of-sequence id as reported by the underlying encoder.
        return self.encoder.EOS

    def __len__(self):
        return self.encoder.vs

    def encode(self, syms):
        """Encode text into a list of sub-word ids."""
        return self.encoder.encode_ids(syms)

    def decode(self, ids):
        """Decode sub-word ids back into text.

        The encoder returns a list (rather than a string) for some inputs;
        that case is treated as an empty decoding.
        """
        decoded = self.encoder.decode_ids(ids)
        return decoded if not isinstance(decoded, list) else ''
def test_multi_language():
    """Smoke-test the multilingual BPEmb model on mixed-language input."""
    sentences = ["This is Stratford", "Kitap okuyordu."]
    model = BPEmb(lang="multi", add_pad_emb=True)
    # Encode with BOS/EOS markers, then decode two known id sequences.
    print(model.encode_ids_with_bos_eos(sentences))
    print(model.decode_ids([[1, 5496, 200, 23866, 3927, 2],
                            [1, 45350, 44934, 67191, 94777, 2]]))
def test_decoding():
    """Smoke-test decoding a batch of id sequences with the English model.

    NOTE: although a <pad> embedding is added, ``decode_ids`` cannot handle
    the padding id — padding must be stripped before decoding.  Decoding
    removes the start/end tokens by itself.
    """
    model = BPEmb(lang="en", add_pad_emb=True)
    batch = [[1, 215, 80, 8526, 1221, 2],
             [1, 215, 80, 8526, 1221, 2]]
    print(model.decode_ids(batch))
class BPembTokenizer(Tokenizer):
    """Tokenizer backed by pretrained byte-pair-encoding embeddings (BPEmb)."""

    def __init__(self, vocab_size=50000, emb_dim=300, lang='en'):
        super(BPembTokenizer, self).__init__()
        # Imported lazily so the bpemb dependency is only needed when used.
        from bpemb import BPEmb
        self.bpemb_en = BPEmb(lang=lang, vs=vocab_size, dim=emb_dim)

    def get_embeddings(self):
        """Return the pretrained embedding matrix."""
        return self.bpemb_en.vectors

    def encode_ids(self, text):
        """Map text to a list of sub-word ids."""
        return self.bpemb_en.encode_ids(text)

    def decode_ids(self, ids):
        """Map sub-word ids back to text."""
        return self.bpemb_en.decode_ids(ids)

    def tokenize(self, text):
        """Split text into sub-word token strings."""
        return self.bpemb_en.encode(text)
class Predictor(PredictorBase):
    """Serves predictions from a trained seq2seq dialogue model.

    Restores a TF1 session from a checkpoint at construction time and keeps
    it open for subsequent ``predict`` calls.
    """

    def __init__(self, config):
        super(Predictor, self).__init__(config)
        self.config = config
        self.model = None
        self.sess = None
        # self.builder = tf.saved_model.builder.SavedModelBuilder("savedModel")
        if self.config["use_bpe"]:
            # Chinese BPE encoder; ids are shifted by +1 in sentence_to_encode
            # and shifted back by -1 in response().
            self.bpe_zh = BPEmb(lang="zh", vs=config["vocab_size"])
        else:
            # Load the vocabulary (word -> index) and build the reverse map.
            self.word_to_idx = self.load_vocab()
            self.idx_to_label = {value: key for key, value in self.word_to_idx.items()}
        # Initialize the model.
        self.create_model()
        print("load model finished")
        # Load the computation graph / restore session state.
        self.load_graph()
        print("load graph finished")

    def load_vocab(self):
        # Load the word -> index mapping from a pickle file.
        # NOTE(review): self.output_path presumably comes from PredictorBase — confirm.
        with open(os.path.join(self.output_path, "word_to_index.pkl"), "rb") as f:
            word_to_index = pickle.load(f)
        return word_to_index

    def sentence_to_encode(self, sentence):
        """
        Encode a raw sentence into model input ids.

        Returns None for empty input or input longer than 20 elements.
        :return: dict with key "encoder_inputs", or None
        """
        if not sentence:
            return None
        if len(sentence) > 20:
            return None
        if self.config["use_bpe"]:
            word_idx = self.bpe_zh.encode_ids(sentence)
            # Shift ids by +1 (response() undoes this with -1).
            word_idx = list(map(lambda x: x + 1, word_idx))
        else:
            # Unknown tokens fall back to the "UNK" index.
            word_idx = [self.word_to_idx.get(token, self.word_to_idx["UNK"]) for token in sentence]
        new_word_idx = self.process_data(word_idx)
        return new_word_idx

    @staticmethod
    def process_data(sentence):
        """
        Wrap a single id sequence as a one-item batch.

        :param sentence: list of token ids
        :return: dict(encoder_inputs=[sentence])
        """
        encoder_inputs = [sentence]
        return dict(encoder_inputs=encoder_inputs)

    def response(self, tokens_list):
        """
        Convert beam-search output ids back into text.

        :param tokens_list: 2-D array indexed as [:, beam] — assumed shape
            (time, beam_size) from the indexing; TODO confirm against infer().
        :return: list of beam_size decoded sentences
        """
        sents = []
        for i in range(self.config["beam_size"]):
            sent_token = tokens_list[:, i]
            if self.config["use_bpe"]:
                # Undo the +1 shift applied in sentence_to_encode.
                sent = self.bpe_zh.decode_ids(list(map(lambda x: x - 1, sent_token)))
            else:
                sent = "".join([self.idx_to_label[token] for token in sent_token])
            sents.append(sent)
        return sents

    def create_model(self):
        """
        Select and initialize the model according to the config.

        NOTE(review): "seq2seq_lstm" maps to Seq2SeqTransformer — confirm intended.
        NOTE(review): len(self.word_to_idx) is used here, but word_to_idx is only
        set in the non-BPE branch of __init__; with use_bpe this would raise — verify.
        :return:
        """
        if self.config["model_name"] == "seq2seq_lstm":
            self.model = Seq2SeqTransformer(config=self.config, vocab_size=len(self.word_to_idx),
                                            word_vectors=None)
        if self.config["model_name"] == "seq2seq_bilstm":
            self.model = Seq2SeqBiLstmModel(config=self.config, vocab_size=len(self.word_to_idx),
                                            word_vectors=None)

    def load_graph(self):
        """
        Restore the TF session from the latest checkpoint.

        :raises ValueError: if no checkpoint exists at config["ckpt_model_path"]
        :return:
        """
        self.sess = tf.Session()
        ckpt = tf.train.get_checkpoint_state(os.path.join(os.path.abspath(os.path.dirname(os.getcwd())),
                                                          self.config["ckpt_model_path"]))
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print('Reloading model parameters..')
            self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
        else:
            raise ValueError('No such file:[{}]'.format(self.config["ckpt_model_path"]))
        # Commented-out SavedModel export path retained from the original:
        # inputs = {"inputs": tf.saved_model.utils.build_tensor_info(self.model.encoder_inputs),
        #           "inputs_length": tf.saved_model.utils.build_tensor_info(self.model.encoder_inputs_length),
        #           "keep_prob": tf.saved_model.utils.build_tensor_info(self.model.keep_prob)}
        #
        # outputs = {"predictions": tf.saved_model.utils.build_tensor_info(self.model.predictions)}
        #
        # prediction_signature = tf.saved_model.signature_def_utils.build_signature_def(inputs=inputs,
        #                                                                               outputs=outputs,
        #                                                                               method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
        # legacy_init_op = tf.group(tf.tables_initializer(), name="legacy_init_op")
        # self.builder.add_meta_graph_and_variables(self.sess, [tf.saved_model.tag_constants.SERVING],
        #                                           signature_def_map={"dialogue": prediction_signature},
        #                                           legacy_init_op=legacy_init_op)
        # self.builder.save()

    def predict(self, sentence):
        """
        Predict candidate responses for a single sentence.

        NOTE(review): sentence_to_encode may return None (empty/too-long input),
        which would raise a TypeError here — confirm callers guard against that.
        :return: list of decoded candidate responses
        """
        sentence_ids = self.sentence_to_encode(sentence)
        prediction_ = self.model.infer(sentence_ids["encoder_inputs"])
        prediction = self.sess.run(prediction_)
        print(prediction.shape)
        response = self.response(prediction)
        return response
class LanguagePeripheral(base_peripheral):
    """Embeds BPEmb sub-word token sequences and projects them to output_dim.

    Ids 0..vocab_size-1 come from the BPEmb vocabulary; the extra id equal to
    ``vocab_size`` is used as the padding token.
    """

    def __init__(self, output_dim, vocab_size=10000, embed_dim=50, lang='en',
                 embedding_preload=True, gpu_id=-1, dropout=0):
        super(LanguagePeripheral, self).__init__()
        self.gpu_id = gpu_id
        # The padding id sits one past the BPEmb vocabulary.
        self.pad_char = vocab_size
        self.bpe_encoder = BPEmb(lang=lang, vs=vocab_size, dim=embed_dim,
                                 add_pad_emb=True)
        # One extra embedding row for the padding character.
        self.embed_layer = nn.Embedding(vocab_size + 1, embed_dim,
                                        padding_idx=self.pad_char)
        if embedding_preload:  # fixed: was `if (embedding_preload == True):`
            self.embed_layer.load_state_dict(
                {'weight': torch.tensor(self.bpe_encoder.emb.vectors)})
            print("Loading pretrained word embeddings.")
        self.enc_dropout = nn.Dropout(dropout)
        self.output = nn.Linear(embed_dim, output_dim)

    def forward(self, tokens):
        """Embed token ids and project to output_dim.

        :param tokens: integer tensor of token ids.
        :return: projected embeddings with a singleton axis inserted at dim 2.
        """
        # Fixed: a pad mask was computed here but never used — removed.
        embeddings = self.embed_layer(tokens)
        embeddings = self.enc_dropout(embeddings)
        output = self.output(embeddings)
        return output.unsqueeze(2)

    def embed_sentences(self, sentences):
        """Tokenize raw sentences and return (projected embeddings, pad mask)."""
        tokens, pad_mask = self.tokenize_sentences(sentences)
        return self.forward(tokens), pad_mask

    def decode_tokens(self, tokens):
        """Decode id sequences to text, truncating at EOS and dropping padding.

        Accepts a torch tensor, a numpy array, or nested lists of ids.
        """
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.cpu().numpy().astype(int).tolist()
        elif isinstance(tokens, np.ndarray):
            tokens = tokens.astype(int).tolist()
        # Keep only ids before the first EOS, and drop anything at or above
        # the padding id (out-of-vocabulary for the decoder).
        filtered_tokens = []
        for seq in tokens:
            values = []
            for token_id in seq:
                if token_id == self.id_EOS:
                    break
                elif token_id < self.id_PAD:
                    values.append(token_id)
            filtered_tokens.append(values)
        return self.bpe_encoder.decode_ids(filtered_tokens)

    def tokenize_sentences(self, sentences):
        """Encode sentences with BOS/EOS and right-pad to the batch maximum.

        :return: (token tensor, boolean pad mask of the same shape)
        """
        tokens = self.bpe_encoder.encode_ids_with_bos_eos(sentences)
        # max(..., default=0) also handles an empty batch safely.
        max_len = max((len(t) for t in tokens), default=0)
        for seq in tokens:
            seq.extend([self.pad_char] * (max_len - len(seq)))
        tokens = torch.tensor(np.array(tokens))
        if self.gpu_id > -1:
            tokens = tokens.cuda(self.gpu_id)
        pad_mask = tokens.eq(self.id_PAD)
        return tokens, pad_mask

    @property
    def id_PAD(self):
        # Padding id (== vocab_size, see __init__).
        return self.pad_char

    @property
    def id_GO(self):
        # BOS id in the BPEmb convention.
        return 1

    @property
    def id_EOS(self):
        # EOS id in the BPEmb convention.
        return 2
class Model():
    """Punctuation-restoration model.

    Loads a saved TF model and inserts punctuation sub-word ids into a
    sentence, capitalising the first letter after sentence-ending punctuation.
    """

    def __init__(self, export_dir, vocab_size=5000, emb_dim=200, dict_punct=None):
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.bpemb_ru = BPEmb(lang='ru', vs=vocab_size, dim=emb_dim)
        self.export_dir = export_dir
        self.predict_fn = predictor.from_saved_model(export_dir)
        # Mapping from predicted punctuation class to BPEmb sub-word id.
        self.d = dict_punct if dict_punct is not None else {
            1: 4922,
            2: 4921,
            3: 4978,
            4: 4985,
            5: 4947,
            6: 4963,
            7: 4936,
        }

    def parse_fn(self, line):
        """Encode a string into model features: (id array, length array)."""
        feature = np.array([self.bpemb_ru.encode_ids(line)]).astype(np.int32)
        return feature, np.array([len(feature[0])])

    def to_capital_latter(self, sentence):
        """Capitalise the first non-space character after '.', '?' or '!'."""
        pieces = []
        capitalize_next = True
        for ch in sentence:
            if capitalize_next and ch != ' ':
                pieces.append(ch.upper())
                capitalize_next = False
            else:
                pieces.append(ch)
                if ch in '.?!':
                    capitalize_next = True
        return ''.join(pieces)

    def predict(self, line):
        """Run the saved model on *line* and return the punctuated text."""
        x, x_len = self.parse_fn(line)
        result = self.predict_fn({'x': x, 'len': x_len})
        ids = []
        for i in range(result['lengths'][0]):
            ids.append(result['sequences'][0][i])
            # Class 0 means "no punctuation after this sub-word".
            if result['prediction'][0][i] != 0:
                ids.append(self.d[result['prediction'][0][i]])
        return self.to_capital_latter(self.bpemb_ru.decode_ids(np.array(ids)))
from bpemb import BPEmb
import argparse
import os
from tqdm import tqdm


def _build_arg_parser():
    # CLI: language code, destination directory and vocabulary size.
    parser = argparse.ArgumentParser()
    parser.add_argument("--lang", type=str, default="en")
    parser.add_argument("--output-dir", type=str, default="data/embeddings")
    parser.add_argument("--vs", type=int, default=200000)
    return parser


def main():
    """Dump a BPEmb model to a word2vec-style text embedding file."""
    args = _build_arg_parser().parse_args()
    bpe = BPEmb(lang=args.lang, vs=args.vs)
    out_path = os.path.join(args.output_dir,
                            "bpe_{}_{}.txt".format(args.lang, args.vs))
    with open(out_path, "w") as f:
        n_rows = bpe.vectors.shape[0]
        n_dims = bpe.vectors.shape[1]
        for i in tqdm(range(n_rows)):
            # Sub-word string for id i, with the BPEmb space marker removed.
            word = bpe.decode_ids([i]).replace(" ", "")
            vec = bpe.vectors[i]
            values = " ".join(str(vec[j]) for j in range(n_dims))
            f.write(word + " " + values + "\n")


if __name__ == "__main__":
    main()