import numpy as np
from bpemb import BPEmb


def clean_sub_word_sentence(word_ids: np.ndarray, bpemb: BPEmb):
    # The model reserves index 0 for padding, so shift the ids back by 1
    # before handing them to BPEmb for decoding.
    word_ids = word_ids - 1
    try:
        index = list(word_ids).index(bpemb.EOS)
        words = bpemb.decode_ids(word_ids[:index])
    except ValueError:  # No EOS found in sequence
        words = bpemb.decode_ids(word_ids)

    return words
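A minimal usage sketch for the helper above, assuming the surrounding model shifted all ids by +1 to reserve index 0 for padding (the sketch is illustrative, not from the original project):

import numpy as np
from bpemb import BPEmb

bpemb_en = BPEmb(lang="en", vs=10000)
# ids as such a model would emit them: BOS/EOS wrapped, then shifted by +1
shifted = np.array(bpemb_en.encode_ids_with_bos_eos("hello world")) + 1
print(clean_sub_word_sentence(shifted, bpemb_en))  # expected: "hello world"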
Example #2
class SubWordVocab(object):
    def __init__(self, size):
        self.encoder = BPEmb(lang='en', vs=size)

        # BPEmb/SentencePiece models use <s>=1 and </s>=2; fail fast otherwise
        assert self.encoder.BOS == 1
        assert self.encoder.EOS == 2

    def __len__(self):
        return self.encoder.vs

    @property
    def sos_id(self):
        return 1

    @property
    def eos_id(self):
        return self.encoder.EOS

    def encode(self, syms):
        return self.encoder.encode_ids(syms)

    def decode(self, ids):
        syms = self.encoder.decode_ids(ids)
        if isinstance(syms, list):
            # batched input decodes to a list of strings; this vocab only
            # supports single sequences, so treat that case as empty
            return ''
        return syms
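A short usage sketch for the class above (illustrative; the English BPEmb model for the requested vocabulary size is downloaded on first use):

vocab = SubWordVocab(10000)
ids = vocab.encode("hello world")
print(len(vocab))         # 10000
print(vocab.decode(ids))  # "hello world"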
Example #3
def test_multi_language():
    text = ["This is Stratford", "Kitap okuyordu."]
    # lang="multi" loads the single multilingual BPEmb model
    bpemb_multi = BPEmb(lang="multi", add_pad_emb=True)
    print(bpemb_multi.encode_ids_with_bos_eos(text))
    print(
        bpemb_multi.decode_ids([[1, 5496, 200, 23866, 3927, 2],
                                [1, 45350, 44934, 67191, 94777, 2]]))
Example #4
def test_decoding():
    # Although a <pad> embedding can be added, decode_ids cannot handle the
    # pad id, so remove any padding before decoding.
    # Decoding also strips the start/end (BOS/EOS) tokens.
    bpemb_en = BPEmb(lang="en", add_pad_emb=True)
    # ids = [1, 215, 80, 8526, 1221, 2]
    ids = [[1, 215, 80, 8526, 1221, 2], [1, 215, 80, 8526, 1221, 2]]
    # ids = [1, 215, 80, 8526, 1221, 2, 10000, 10000]  # pad ids (10000) would break decode_ids
    # print(bpemb_en.vectors[10000])  # the pad embedding itself is accessible
    print(bpemb_en.decode_ids(ids))
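A small helper sketch for the pad issue noted above; strip_pad is a hypothetical helper, and the pad id equals the vocabulary size (10000 for the default English model) because the pad embedding is appended after the regular vocabulary:

from bpemb import BPEmb

bpemb_en = BPEmb(lang="en", add_pad_emb=True)

def strip_pad(batch_ids, pad_id=10000):
    # drop pad ids so decode_ids only sees real subword ids
    return [[i for i in seq if i != pad_id] for seq in batch_ids]

padded = [[1, 215, 80, 8526, 1221, 2, 10000, 10000]]
print(bpemb_en.decode_ids(strip_pad(padded)))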
Example #5
class BPembTokenizer(Tokenizer):
    def __init__(self, vocab_size=50000, emb_dim=300, lang='en'):
        super(BPembTokenizer, self).__init__()
        from bpemb import BPEmb
        self.bpemb_en = BPEmb(lang=lang, vs=vocab_size, dim=emb_dim)

    def get_embeddings(self):
        return self.bpemb_en.vectors

    def encode_ids(self, text):
        return self.bpemb_en.encode_ids(text)

    def decode_ids(self, ids):
        return self.bpemb_en.decode_ids(ids)

    def tokenize(self, text):
        return self.bpemb_en.encode(text)
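An illustrative usage sketch for the wrapper above, assuming the project's Tokenizer base class takes no constructor arguments:

tok = BPembTokenizer(vocab_size=10000, emb_dim=50)
print(tok.tokenize("hello world"))    # subword strings, e.g. ['▁hello', '▁world']
print(tok.encode_ids("hello world")) # the corresponding subword ids
print(tok.get_embeddings().shape)    # (10000, 50)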
Example #6
class Predictor(PredictorBase):
    def __init__(self, config):
        super(Predictor, self).__init__(config)
        self.config = config
        self.model = None
        self.sess = None
        # self.builder = tf.saved_model.builder.SavedModelBuilder("savedModel")

        if self.config["use_bpe"]:
            self.bpe_zh = BPEmb(lang="zh", vs=config["vocab_size"])
        else:
            # load the vocabulary
            self.word_to_idx = self.load_vocab()
            self.idx_to_label = {value: key for key, value in self.word_to_idx.items()}

        # initialize the model
        self.create_model()
        print("load model finished")
        # load the computation graph
        self.load_graph()
        print("load graph finished")

    def load_vocab(self):
        # load the word-to-index mapping from disk
        with open(os.path.join(self.output_path, "word_to_index.pkl"), "rb") as f:
            word_to_index = pickle.load(f)

        return word_to_index

    def sentence_to_encode(self, sentence):
        """
        创建数据对象
        :return:
        """
        if not sentence:
            return None

        if len(sentence) > 20:
            return None

        if self.config["use_bpe"]:
            word_idx = self.bpe_zh.encode_ids(sentence)
            word_idx = list(map(lambda x: x + 1, word_idx))
        else:
            word_idx = [self.word_to_idx.get(token, self.word_to_idx["UNK"]) for token in sentence]

        new_word_idx = self.process_data(word_idx)
        return new_word_idx

    @staticmethod
    def process_data(sentence):
        """
        对数据做预处理
        :param sentence:
        :return:
        """
        encoder_inputs = [sentence]
        return dict(encoder_inputs=encoder_inputs)

    def response(self, tokens_list):
        sents = []
        for i in range(self.config["beam_size"]):
            sent_token = tokens_list[:, i]
            if self.config["use_bpe"]:
                # undo the +1 pad shift applied during encoding
                sent = self.bpe_zh.decode_ids(list(map(lambda x: x - 1, sent_token)))
            else:
                sent = "".join([self.idx_to_label[token] for token in sent_token])
            sents.append(sent)

        return sents

    def create_model(self):
        """
        Select the model according to the config file and initialize it.
        :return:
        """
        # with BPE there is no word_to_idx vocabulary; fall back to the
        # configured vocabulary size
        vocab_size = (self.config["vocab_size"] if self.config["use_bpe"]
                      else len(self.word_to_idx))
        if self.config["model_name"] == "seq2seq_lstm":
            self.model = Seq2SeqTransformer(config=self.config, vocab_size=vocab_size,
                                            word_vectors=None)

        if self.config["model_name"] == "seq2seq_bilstm":
            self.model = Seq2SeqBiLstmModel(config=self.config, vocab_size=vocab_size,
                                            word_vectors=None)

    def load_graph(self):
        """
        加载计算图
        :return:
        """
        self.sess = tf.Session()
        ckpt = tf.train.get_checkpoint_state(os.path.join(os.path.abspath(os.path.dirname(os.getcwd())),
                                                          self.config["ckpt_model_path"]))
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print('Reloading model parameters..')
            self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
        else:
            raise ValueError('No such file:[{}]'.format(self.config["ckpt_model_path"]))
        # inputs = {"inputs": tf.saved_model.utils.build_tensor_info(self.model.encoder_inputs),
        #           "inputs_length": tf.saved_model.utils.build_tensor_info(self.model.encoder_inputs_length),
        #           "keep_prob": tf.saved_model.utils.build_tensor_info(self.model.keep_prob)}
        #
        # outputs = {"predictions": tf.saved_model.utils.build_tensor_info(self.model.predictions)}
        #
        # prediction_signature = tf.saved_model.signature_def_utils.build_signature_def(inputs=inputs,
        #                                                                               outputs=outputs,
        #                                                                               method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
        # legacy_init_op = tf.group(tf.tables_initializer(), name="legacy_init_op")
        # self.builder.add_meta_graph_and_variables(self.sess, [tf.saved_model.tag_constants.SERVING],
        #                                           signature_def_map={"dialogue": prediction_signature},
        #                                           legacy_init_op=legacy_init_op)

        # self.builder.save()

    def predict(self, sentence):
        """
         给定一条句子,预测结果
        :return:
        """
        sentence_ids = self.sentence_to_encode(sentence)
        prediction_ = self.model.infer(sentence_ids["encoder_inputs"])
        prediction = self.sess.run(prediction_)
        print(prediction.shape)
        response = self.response(prediction)
        return response
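A heavily hedged usage sketch for the class above; the config keys mirror the ones referenced in the code, but the values and checkpoint path are illustrative, and a trained checkpoint must already exist:

config = {
    "use_bpe": True,
    "vocab_size": 10000,
    "beam_size": 5,
    "model_name": "seq2seq_bilstm",
    "ckpt_model_path": "ckpt/seq2seq",  # hypothetical path
}
predictor = Predictor(config)
print(predictor.predict("你好"))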
Example #7
class LanguagePeripheral(base_peripheral):
    def __init__(self,
                 output_dim,
                 vocab_size=10000,
                 embed_dim=50,
                 lang='en',
                 embedding_preload=True,
                 gpu_id=-1,
                 dropout=0):
        super(LanguagePeripheral, self).__init__()
        self.gpu_id = gpu_id
        # the pad id sits right after the regular vocabulary
        self.pad_char = vocab_size
        self.bpe_encoder = BPEmb(lang=lang,
                                 vs=vocab_size,
                                 dim=embed_dim,
                                 add_pad_emb=True)
        # Add an extra padding character
        self.embed_layer = nn.Embedding(vocab_size + 1,
                                        embed_dim,
                                        padding_idx=self.pad_char)
        if embedding_preload:
            print("Loading pretrained word embeddings.")
            self.embed_layer.load_state_dict(
                {'weight': torch.tensor(self.bpe_encoder.emb.vectors)})
        self.enc_dropout = nn.Dropout(dropout)
        self.output = nn.Linear(embed_dim, output_dim)

    def forward(self, tokens):
        embeddings = self.embed_layer(tokens)
        embeddings = self.enc_dropout(embeddings)
        output = self.output(embeddings)
        return output.unsqueeze(2)

    def embed_sentences(self, sentences):
        # Generate the tokens using BPEmb
        tokens, pad_mask = self.tokenize_sentences(sentences)
        return self.forward(tokens), pad_mask

    def decode_tokens(self, tokens):
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.cpu().numpy().astype(int).tolist()
        elif isinstance(tokens, np.ndarray):
            tokens = tokens.astype(int).tolist()
        # Drop every token from EOS onward and filter out ids >= the pad id
        filtered_tokens = []
        for t in tokens:
            values = []
            for i in t:
                if i == self.id_EOS:
                    break
                elif i < self.id_PAD:
                    values.append(i)
            filtered_tokens.append(values)
        # Decode the filtered id lists back into strings
        return self.bpe_encoder.decode_ids(filtered_tokens)

    def tokenize_sentences(self, sentences):
        tokens = self.bpe_encoder.encode_ids_with_bos_eos(sentences)
        # Pad every sequence with pad_char up to the batch maximum length
        max_len = max(len(t) for t in tokens)
        for i in range(len(tokens)):
            tok_len = len(tokens[i])
            tokens[i].extend([self.pad_char] * (max_len - tok_len))
        tokens = torch.tensor(np.array(tokens))
        if self.gpu_id > -1:
            tokens = tokens.cuda(self.gpu_id)
        pad_mask = tokens.eq(self.id_PAD)
        return tokens, pad_mask

    @property
    def id_PAD(self):
        return self.pad_char

    @property
    def id_GO(self):
        return 1

    @property
    def id_EOS(self):
        return 2
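An illustrative usage sketch for the peripheral above, assuming base_peripheral imposes no required constructor arguments (the English BPEmb model downloads on first use):

peripheral = LanguagePeripheral(output_dim=128, vocab_size=10000, embed_dim=50)
out, pad_mask = peripheral.embed_sentences(["hello world", "hi"])
print(out.shape)  # (batch, max_len, 1, output_dim)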
Example #8
class Model():
    '''
    Model class: loads a saved model and restores punctuation in a sentence.
    '''
    def __init__(self,
                 export_dir,
                 vocab_size=5000,
                 emb_dim=200,
                 dict_punct=None):
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim

        self.bpemb_ru = BPEmb(lang='ru', vs=vocab_size, dim=emb_dim)

        self.export_dir = export_dir
        self.predict_fn = predictor.from_saved_model(export_dir)
        if dict_punct is None:
            # default mapping from punctuation class id to the BPE id of the
            # corresponding punctuation mark
            self.d = {
                1: 4922,
                2: 4921,
                3: 4978,
                4: 4985,
                5: 4947,
                6: 4963,
                7: 4936
            }
        else:
            self.d = dict_punct

    def parse_fn(self, line):
        '''
        Encode a string into ids.
        line: the input string
        '''
        feature = np.array([self.bpemb_ru.encode_ids(line)]).astype(np.int32)
        return feature, np.array([len(feature[0])])

    def to_capital_letter(self, sentence):
        '''Capitalize the first letter of each sentence (after ., ? or !).'''
        tmp = ''
        flag = True
        for c in sentence:
            if flag and c != ' ':
                tmp += c.upper()
                flag = False
            else:
                tmp += c
            if c in '.?!':
                flag = True
        return tmp

    def predict(self, line):
        x, x_len = self.parse_fn(line)
        predict = self.predict_fn({'x': x, 'len': x_len})
        a = []
        for i in range(predict['lengths'][0]):
            a.append(predict['sequences'][0][i])
            if predict['prediction'][0][i] != 0:
                a.append(self.d[predict['prediction'][0][i]])
        return self.to_capital_letter(self.bpemb_ru.decode_ids(np.array(a)))
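An illustrative usage sketch for the class above; the export directory is hypothetical and must contain a TensorFlow SavedModel matching the signature used in predict (the predictor module is presumably tensorflow.contrib.predictor):

model = Model(export_dir="saved_model/punct")  # hypothetical path
print(model.predict("hello how are you"))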
Example #9
from bpemb import BPEmb
import argparse
import os
from tqdm import tqdm

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--lang", type=str, default="en")
    parser.add_argument("--output-dir", type=str, default="data/embeddings")
    parser.add_argument("--vs", type=int, default=200000)
    args = parser.parse_args()

    bpe = BPEmb(lang=args.lang, vs=args.vs)

    # make sure the output directory exists before writing
    os.makedirs(args.output_dir, exist_ok=True)

    with open(
            os.path.join(args.output_dir,
                         "bpe_{}_{}.txt".format(args.lang, args.vs)),
            "w") as f:
        for i in tqdm(range(bpe.vectors.shape[0])):
            w = bpe.decode_ids([i])
            w = w.replace(" ", "")
            vec = bpe.vectors[i]
            f.write(
                w + " " +
                " ".join([str(vec[j])
                          for j in range(bpe.vectors.shape[1])]) + "\n")