Ejemplo n.º 1
0
 def save_params(self):
     """Mirror the model hyper-parameters into ``self.params`` and persist them.

     Each attribute is stored under a key of the same name, then the whole
     dict is written to ``self.params_path`` as JSON.
     """
     for key in ('num_classes', 'p2s_dict', 'i2p_dict', 'p2o_dict', 'max_len'):
         self.params[key] = getattr(self, key)
     save_json(jsons=self.params, json_path=self.params_path)
Ejemplo n.º 2
0
 def save_params(self):
     """Persist label mappings and hyper-parameters for later reload.

     Copies the label bookkeeping attributes into ``self.params`` and dumps
     the dict to ``self.params_path`` as JSON.
     """
     mirrored = {
         'num_classes': self.num_classes,
         'labels': self.labels,
         'index2label': self.index2label,
         'label2index': self.label2index,
         'max_len': self.max_len,
     }
     self.params.update(mirrored)
     save_json(jsons=self.params, json_path=self.params_path)
Ejemplo n.º 3
0
    def fit_generator(self):
        """Train ``self.model`` from data generators and log final metrics.

        Persists the (inference-mode) hyper-parameters and the index->label
        map first, then trains with Keras-style ``fit_generator`` on
        ``MyDataGenerator`` instances built over the train/valid splits.
        """
        # Save hyper-parameters; these flags must be off at inference time.
        env = self.parameters['model_env_parameters']
        env['is_training'] = False
        env['trainable'] = False
        save_json(jsons=self.i2l, json_path=self.index2label_path)
        save_json(jsons=self.parameters, json_path=self.path_parameters)

        # One generator per data split, identically configured.
        train_gen, valid_gen = (
            MyDataGenerator(split,
                            self.l2i,
                            self.tokenizer,
                            self.categories,
                            self.max_len,
                            self.batch_size,
                            shuffle=True)
            for split in (self.train_data, self.valid_data)
        )

        # Model training.
        history = self.model.fit_generator(
            train_gen.__iter__(),
            steps_per_epoch=len(train_gen),
            epochs=self.epoch,
            validation_data=valid_gen.__iter__(),
            validation_steps=len(valid_gen),
            callbacks=self.callback(),
        )
        last_epoch = history.epoch[-1] + 1
        acc = history.history['acc'][-1]
        val_acc = history.history['val_acc'][-1]
        logger.info("model:{}  last_epoch:{}  train_acc{}  val_acc{}".format(
            self.model_code, last_epoch, acc, val_acc))
Ejemplo n.º 4
0
def fit_process(self, embedding_type, path, embed, rate=1, shuffle=True):
    """Load a labeled CSV, embed the questions and one-hot encode the labels.

    Args:
        embedding_type: one of 'bert'/'albert'/'xlnet'/other; selects how the
            per-sample embedder output is split into model input arrays.
        path: CSV file with 'ques' and 'label' columns.
        embed: embedder exposing ``sentence2idx(text)`` (and ``trainable``
            for the xlnet branch).
        rate: fraction of the data to keep (ignored when that would leave
            <= 500 samples — then all data is used).
        shuffle: shuffle questions/labels in lockstep before slicing.

    Returns:
        (x_all, y): model inputs (list of arrays or a single array, depending
        on ``embedding_type``) and the one-hot label matrix.
    """
    data = pd.read_csv(path)
    # Upper-case both columns so label lookup is case-insensitive.
    ques = [str(q).upper() for q in data['ques'].tolist()]
    label = [str(l).upper() for l in data['label'].tolist()]
    if shuffle:
        ques = np.array(ques)
        label = np.array(label)
        indexs = list(range(len(label)))
        random.shuffle(indexs)
        ques, label = ques[indexs].tolist(), label[indexs].tolist()
    # Reuse the label<->index mapping if one was already saved; otherwise
    # build it from the observed label set and persist it.
    if not os.path.exists(self.path_fast_text_model_l2i_i2l):
        label2index = {}
        index2label = {}
        for count, label_one in enumerate(set(label)):
            label2index[label_one] = count
            index2label[count] = label_one
        l2i_i2l = {'l2i': label2index, 'i2l': index2label}
        save_json(l2i_i2l, self.path_fast_text_model_l2i_i2l)
    else:
        l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)

    len_ql = int(rate * len(ques))
    if len_ql <= 500:  # sampling disabled for small corpora so training works
        len_ql = len(ques)

    x = []
    print("ques to index start!")
    for que in tqdm(ques[0:len_ql]):
        x.append(embed.sentence2idx(que))  # [[], ]
    label_zo = []
    print("label to onehot start!")
    l2i = l2i_i2l['l2i']
    num_labels = len(l2i)
    for label_one in tqdm(label[0:len_ql]):
        onehot = [0] * num_labels
        onehot[l2i[label_one]] = 1
        label_zo.append(onehot)

    y_ = np.array(label_zo)
    if embedding_type in ['bert', 'albert']:
        # Each sample is [token_ids, segment_ids]; split into two inputs.
        x_ = np.array(x)
        x_all = [np.array([sample[0] for sample in x_]),
                 np.array([sample[1] for sample in x_])]
        return x_all, y_
    elif embedding_type == 'xlnet':
        # Each sample is 3 (or 4, when the embedding is trainable) arrays
        # wrapped one level deep; unwrap with [0].
        # NOTE(review): dropped the dead ``count``/``x_0`` debug scaffolding
        # and its leftover ``print`` from the original implementation.
        x_all = [np.array([sample[0][0] for sample in x]),
                 np.array([sample[1][0] for sample in x]),
                 np.array([sample[2][0] for sample in x])]
        if embed.trainable:
            x_all.append(np.array([sample[3][0] for sample in x]))
        return x_all, y_
    else:
        return np.array(x), y_
    def fit_generator(self):
        """Train ``self.model`` from ``MyDataGenerator`` streams and log results.

        Saves the inference-mode hyper-parameters and the index->label map,
        builds train/valid generators, runs Keras-style ``fit_generator``,
        and logs the last epoch's train/validation accuracy.
        """
        # Save hyper-parameters; these flags must be off at inference time.
        self.parameters['model_env_parameters']['is_training'] = False
        self.parameters['model_env_parameters']['trainable'] = False
        save_json(jsons=self.i2l, json_path=self.index2label_path)
        save_json(jsons=self.parameters, json_path=self.path_parameters)

        # NOTE(review): the original defined a local ``DataGenerator`` class
        # here (plus a ``model_code`` local used only by it) that was never
        # instantiated — training uses ``MyDataGenerator`` below. That dead
        # code has been removed.
        train_D = MyDataGenerator(self.train_data,
                                  self.l2i,
                                  self.tokenizer,
                                  self.categories,
                                  self.max_len,
                                  self.batch_size,
                                  shuffle=True)
        valid_D = MyDataGenerator(self.valid_data,
                                  self.l2i,
                                  self.tokenizer,
                                  self.categories,
                                  self.max_len,
                                  self.batch_size,
                                  shuffle=True)

        # Model training.
        history = self.model.fit_generator(
            train_D.__iter__(),
            steps_per_epoch=len(train_D),
            epochs=self.epoch,
            validation_data=valid_D.__iter__(),
            validation_steps=len(valid_D),
            callbacks=self.callback(),
        )
        epoch = history.epoch[-1] + 1
        acc = history.history['acc'][-1]
        val_acc = history.history['val_acc'][-1]
        logger.info("model:{}  last_epoch:{}  train_acc{}  val_acc{}".format(
            self.model_code, epoch, acc, val_acc))
Ejemplo n.º 6
0
 def save_params(self):
     """Record ``max_len`` in ``self.params`` and write the dict to JSON."""
     self.params.update(max_len=self.max_len)
     save_json(jsons=self.params, json_path=self.params_path)