Code example #1
    def fit(self, x_train, y_train, x_dev, y_dev):
        """
            训练
        :param x_train: 
        :param y_train: 
        :param x_dev: 
        :param y_dev: 
        :return: 
        """
        # 保存超参数
        self.hyper_parameters["model"]["is_training"] = False  # 预测时候这些设为False
        self.hyper_parameters["model"]["trainable"] = False
        self.hyper_parameters["model"]["dropout"] = 1.0

        save_json(json_lines=self.hyper_parameters,
                  json_path=self.path_hyper_parameters)
        # Train the model
        self.model.fit(x_train,
                       y_train,
                       batch_size=self.batch_size,
                       epochs=self.epochs,
                       validation_data=(x_dev, y_dev),
                       shuffle=True,
                       callbacks=self.callback())
        # Save the embedding if it was fine-tuned (dynamic embedding)
        if self.trainable:
            self.word_embedding.model.save(self.path_fineture)
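
A minimal usage sketch for fit() above; the class name TextClassifierGraph, its constructor, and the random arrays are illustrative assumptions, not part of the source:

    import numpy as np

    # Hypothetical usage: `graph` is an instance of the class defining fit().
    graph = TextClassifierGraph(hyper_parameters)           # assumed constructor
    x_train = np.random.randint(0, 100, size=(32, 50))      # token-id inputs
    y_train = np.eye(2)[np.random.randint(0, 2, size=32)]   # one-hot labels
    graph.fit(x_train, y_train, x_train[:8], y_train[:8])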
Code example #2
    def fit_generator(self, embed, rate=1, encoding="utf-8"):
        """

        :param data_fit_generator: yield, 训练数据
        :param data_dev_generator: yield, 验证数据
        :param steps_per_epoch: int, 训练一轮步数
        :param validation_steps: int, 验证一轮步数
        :return: 
        """
        # 保存超参数
        self.hyper_parameters["model"]["is_training"] = False  # 预测时候这些设为False
        self.hyper_parameters["model"]["trainable"] = False
        self.hyper_parameters["model"]["dropout"] = 0.0

        save_json(json_lines=self.hyper_parameters,
                  json_path=self.path_hyper_parameters)

        pg = PreprocessGenerator(self.path_model_l2i_i2l)
        _, len_train = pg.preprocess_label2set(
            self.hyper_parameters["data"]["train_data"],
            self.embedding_type,
            encoding=encoding)
        data_fit_generator = pg.preprocess_label_question_to_idx_fit_generator(
            embedding_type=self.hyper_parameters["embedding_type"],
            crf_mode=self.hyper_parameters["model"]["crf_mode"],
            path=self.hyper_parameters["data"]["train_data"],
            batch_size=self.batch_size,
            embed=embed,
            rate=rate,
            epochs=self.epochs,
            encoding=encoding)
        _, len_val = pg.preprocess_label2set(
            self.hyper_parameters["data"]["val_data"],
            self.embedding_type,
            encoding=encoding)
        data_dev_generator = pg.preprocess_label_question_to_idx_fit_generator(
            embedding_type=self.hyper_parameters["embedding_type"],
            crf_mode=self.hyper_parameters["model"]["crf_mode"],
            path=self.hyper_parameters["data"]["val_data"],
            batch_size=self.batch_size,
            embed=embed,
            rate=rate,
            epochs=self.epochs,
            encoding=encoding)
        steps_per_epoch = (int(len_train * rate) if len_train > 500 else
                           len_train) // self.batch_size + 1
        validation_steps = (int(len_val * rate) if len_val > 500 else
                            len_val) // self.batch_size + 1
        # Train the model
        self.model.fit_generator(generator=data_fit_generator,
                                 validation_data=data_dev_generator,
                                 callbacks=self.callback(),
                                 epochs=self.epochs,
                                 steps_per_epoch=steps_per_epoch,
                                 validation_steps=validation_steps,
                                 shuffle=True)
        # Save the embedding if it was fine-tuned (dynamic embedding)
        if self.trainable:
            self.word_embedding.model.save(self.path_fineture)
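
The generator variant only needs the embedding object, since file paths and batch size come from the saved hyperparameters. A minimal call sketch, with `graph` and `embed` as assumed placeholders:

    # Hypothetical usage: train from the configured train/val files.
    graph.fit_generator(embed, rate=0.9, encoding="utf-8")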
Code example #3
 def build(self, **kwargs):
     # Randomly initialized embedding: map len_max token ids to embed_size vectors
     self.vocab_size = len(self.token2idx)
     self.input = tf.keras.layers.Input(shape=(self.len_max,), dtype='int32')
     self.output = tf.keras.layers.Embedding(self.vocab_size,
                                             self.embed_size,
                                             input_length=self.len_max,
                                             trainable=self.trainable)(self.input)
     self.model = tf.keras.Model(self.input, self.output)
     # Persist the vocabulary next to the model
     save_json(json_lines=self.token2idx, json_path=os.path.join(self.path_model_dir, 'vocab.txt'))
Code example #4
File: seg_basic.py Project: seeker1943/Macropodus
 def save_delete_words(self, words):
     """
         Delete words from the user dictionary; persisted, takes effect after reload.
     :param words: list, like ['大漠帝国']
     :return: None
     """
     assert isinstance(words, list)
     for w in words:
         self.delete_word(w)  # remove from the global dictionary (not persisted)
         if w in self.dict_user:
             self.dict_user.pop(w)  # remove from the user dictionary (persisted)
     save_json([self.dict_user], path_dict_user)
Code example #5
File: seg_basic.py Project: seeker1943/Macropodus
 def save_add_words(self, word_freqs):
     """
         Add words to the user dictionary; persisted, takes effect after reload.
     :param word_freqs: dict, like {'大漠帝国': 132}
     :return: None
     """
     assert isinstance(word_freqs, dict)
     for k, v in word_freqs.items():
         self.add_word(k, v)  # add to the global dictionary (not persisted)
         self.dict_user[k] = v  # add to the user dictionary (persisted)
     save_json([self.dict_user], path_dict_user)
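
A minimal usage sketch for the two persistence helpers above, assuming `seg` is an instance of the segmenter class in seg_basic.py:

    # Hypothetical usage: add a custom word, then remove it again.
    seg.save_add_words({'大漠帝国': 132})   # persists to the user dictionary
    seg.save_delete_words(['大漠帝国'])     # removes it from the user dictionary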
Code example #6
File: embedding.py Project: zhiyuanding/Macropodus
 def build(self, **kwargs):
     self.vocab_size = len(self.token2idx)
     logger.info("vocab_size is {}".format(str(self.vocab_size)))
     self.input = tf.keras.layers.Input(shape=(self.len_max, ),
                                        dtype='int32',
                                        name="input")
     # vocab_size + 1 rows, so an id equal to vocab_size (e.g. padding/OOV) stays in range
     self.output = tf.keras.layers.Embedding(
         self.vocab_size + 1,
         self.embed_size,
         input_length=self.len_max,
         trainable=self.trainable,
         name="embedding_{}".format(str(self.embed_size)))(self.input)
     self.model = tf.keras.Model(self.input, self.output)
     save_json(json_lines=self.token2idx,
               json_path=os.path.join(self.path_model_dir, 'vocab.txt'))
Code example #7
File: embedding.py Project: zhiyuanding/Macropodus
    def build(self, **kwargs):
        self.embedding_type = 'word2vec'
        # Load pretrained word2vec vectors
        self.key_vector = KeyedVectors.load_word2vec_format(
            self.corpus_path, **kwargs)
        self.embed_size = self.key_vector.vector_size

        self.token2idx = self.ot_dict.copy()
        embedding_matrix = []
        # First add rows for the four special tokens in self.token2idx: [PAD], [UNK], [BOS], [EOS]
        embedding_matrix.append(np.zeros(self.embed_size))  # [PAD] -> all zeros
        embedding_matrix.append(np.random.uniform(-0.5, 0.5, self.embed_size))  # [UNK]
        embedding_matrix.append(np.random.uniform(-0.5, 0.5, self.embed_size))  # [BOS]
        embedding_matrix.append(np.random.uniform(-0.5, 0.5, self.embed_size))  # [EOS]

        # Append a row for every word in the pretrained vocabulary
        for word in self.key_vector.index2entity:
            self.token2idx[word] = len(self.token2idx)
            embedding_matrix.append(self.key_vector[word])

        self.idx2token = {value: key for key, value in self.token2idx.items()}

        self.vocab_size = len(self.token2idx)
        logger.info("vocab_size is {}".format(str(self.vocab_size)))
        embedding_matrix = np.array(embedding_matrix)
        self.input = tf.keras.layers.Input(shape=(self.len_max, ),
                                           dtype='int32',
                                           name="input")

        self.output = tf.keras.layers.Embedding(
            self.vocab_size,
            self.embed_size,
            input_length=self.len_max,
            weights=[embedding_matrix],
            trainable=self.trainable,
            name="embedding_{}".format(str(self.embed_size)))(self.input)
        self.model = tf.keras.Model(self.input, self.output)
        # Save the character/word vocabulary
        save_json(json_lines=self.token2idx,
                  json_path=os.path.join(self.path_model_dir, 'vocab.txt'))
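
Once build() has run, the resulting Keras model maps padded token-id sequences to their pretrained vectors. A minimal sketch, assuming `emb` is the built embedding instance and that index 0 is [PAD] and index 1 is [UNK], following the matrix order above:

    import numpy as np

    # Hypothetical usage: look up ids (falling back to [UNK]=1), pad to len_max.
    tokens = list("你好世界")
    ids = [emb.token2idx.get(t, 1) for t in tokens]
    ids = ids + [0] * (emb.len_max - len(ids))
    vectors = emb.model.predict(np.array([ids]))  # shape (1, len_max, embed_size)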
Code example #8
    def preprocess_label_question_to_idx_fit(self,
                                             embedding_type,
                                             path,
                                             embed,
                                             rate=1,
                                             crf_mode='reg'):
        """
            fit用, 关键:对每一条数据操作,获取label和问句index              
        :param embedding_type: str, like 'albert'
        :param path: str, like 'train.json'
        :param embed: class, like embed
        :param rate: float, like 0.9
        :param crf_mode: str, like 'reg', 'pad'
        :return: np.array
        """
        # 首先获取label,set,即存在的具体类
        label_set, len_all = self.preprocess_label2set(path, embedding_type)
        # 获取label转index字典等, 如果label2index存在则不转换了, dev验证集合的时候用
        if not os.path.exists(self.path_model_l2i_i2l):
            label2index = {}
            index2label = {}
            for count, label_one in enumerate(label_set):
                label2index[label_one] = count
                index2label[count] = label_one
            l2i_i2l = {"l2i": label2index, "i2l": index2label}
            save_json(l2i_i2l, self.path_model_l2i_i2l)
        else:
            l2i_i2l = load_json(self.path_model_l2i_i2l)

        # Fraction of the data to read
        len_ql = int(rate * len_all)
        if len_ql <= 500:  # no subsampling for small corpora, so there is enough data to train
            len_ql = len_all

        def process_line(line, embed, l2i_i2l):
            """
                对每一条数据操作,获取label和问句index
            :param line: 
            :param embed: 
            :param l2i_i2l: 
            :return: 
            """
            # 对每一条数据操作,对question和label进行padding
            ques_label = json.loads(line.strip())
            label_org = ques_label["label"]
            label_index = [l2i_i2l["l2i"][lr] for lr in label_org]
            que_embed = embed.sentence2idx("".join(ques_label["question"]))
            # label padding
            if embedding_type in ['bert', 'albert']:
                # reserve room for [CLS] and [SEP]
                len_leave = embed.len_max - len(label_index) - 2
                if len_leave >= 0:
                    label_index_leave = ([l2i_i2l["l2i"]["<CLS>"]] + label_index
                                         + [l2i_i2l["l2i"]["<PAD>"]] * len_leave
                                         + [l2i_i2l["l2i"]["<SEP>"]])
                else:
                    label_index_leave = ([l2i_i2l["l2i"]["<CLS>"]]
                                         + label_index[0:embed.len_max - 2]
                                         + [l2i_i2l["l2i"]["<SEP>"]])
            else:
                # pad the label sequence to len_max
                len_leave = embed.len_max - len(label_index)
                if len_leave >= 0:
                    label_index_leave = label_index + [l2i_i2l["l2i"]["<PAD>"]] * len_leave
                else:
                    label_index_leave = label_index[0:embed.len_max]
            # Convert to one-hot
            label_res = to_categorical(label_index_leave,
                                       num_classes=len(l2i_i2l["l2i"]))
            return que_embed, label_res

        count_all_line = 0
        x, y = [], []
        with open(path, "r", encoding="utf-8") as file_csv:
            for line in file_csv:
                # stop once the requested fraction of lines has been read
                if len_ql < count_all_line:
                    break
                count_all_line += 1
                if line.strip():
                    # process one JSON sample at a time
                    # note: ideally preprocess beforehand so questions are <= len_max (word2vec) or len_max-2 (bert, albert)
                    x_line, y_line = process_line(line, embed, l2i_i2l)
                    x.append(x_line)
                    y.append(y_line.tolist())

        # Split the inputs by 1) embedding type (bert, word2vec, random) and 2) CRF mode ('pad' or 'reg')
        if embedding_type in ['bert', 'albert']:
            x_, y_ = np.array(x), np.array(y)
            x_1 = np.array([xi[0] for xi in x_])
            x_2 = np.array([xi[1] for xi in x_])
            x_3 = np.array([xi[2] for xi in x_])
            if crf_mode == 'pad':
                x_all = [x_1, x_2, x_3]
            else:  # 'reg' and any other mode
                x_all = [x_1, x_2]
        else:
            x_, y_ = np.array(x), np.array(y)
            x_1 = np.array([xi[0] for xi in x_])
            x_2 = np.array([xi[1] for xi in x_])
            if crf_mode == 'pad':
                x_all = [x_1, x_2]
            else:  # 'reg' and any other mode
                x_all = x_1
        # returned directly when using fit()
        return x_all, y_
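
A minimal sketch wiring this preprocessor into Keras training; `pg`, `embed`, and `graph` are assumed instances, and the argument values mirror the docstring examples:

    # Hypothetical usage: build padded inputs and one-hot labels, then train.
    x_all, y_ = pg.preprocess_label_question_to_idx_fit(
        embedding_type='albert',
        path='train.json',
        embed=embed,
        rate=0.9,
        crf_mode='reg')
    graph.model.fit(x_all, y_, batch_size=32, epochs=3)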