コード例 #1
0
    def __init__(self,
                 maxLen,
                 ocrLen,
                 max_features,
                 init_embedding_matrix,
                 name='basicModel',
                 num_flods=4,
                 batch_size=64):
        """
        Record the hyper-parameters, prepare the K-fold splitter and the
        snapshot callback builder, then build the model via ``create_model``.

        :param maxLen: kept as ``self.maxLen``
        :param ocrLen: kept as ``self.ocrLen``
        :param max_features: kept as ``self.max_features``
        :param init_embedding_matrix: initial embedding matrix; the length of
            its first row becomes ``self.embed_size``
        :param name: kept as ``self.name``
        :param num_flods: number of K-fold splits (spelling kept for callers)
        :param batch_size: kept as ``self.batch_size``
        """
        # Plain hyper-parameters.
        self.name = name
        self.batch_size = batch_size
        self.maxLen = maxLen
        self.ocrLen = ocrLen
        self.max_features = max_features

        # Embedding matrix; its width is taken from the first row.
        self.embedding_matrix = init_embedding_matrix
        self.embed_size = len(self.embedding_matrix[0])

        # Shuffled K-fold with a fixed seed so splits are reproducible.
        self.num_folds = num_flods
        self.kf = KFold(n_splits=num_flods, shuffle=True, random_state=10)

        # Snapshot schedule: 3 snapshots over 12 epochs, starting at lr 5e-4.
        n_snapshots = 3
        initial_lr = 5e-4
        self.snap_epoch = 12
        self.snapshot = SnapshotCallbackBuilder(self.snap_epoch, n_snapshots, initial_lr)

        # Built last so create_model can read every attribute set above.
        self.model = self.create_model()
コード例 #2
0
    def __init__(self, n_folds=5, name='BasicModel', config=None):
        """
        Read hyper-parameters from ``config``, prepare the K-fold splitter and
        the training callbacks, then build the model.

        :param n_folds: number of K-fold splits
        :param name: kept as ``self.name``
        :param config: configuration object; the process exits when it is None.
            Attributes read here: n_class, CHAR_MAXLEN, WORD_MAXLEN,
            max_c_features, max_w_features, BATCH_SIZE, char_init_embed,
            word_init_embed, n_gpus, wd.
        """
        if config is None:
            exit('请传入数值')
        self.name = name
        self.config = config
        self.n_class = config.n_class
        # char-level features
        self.char_max_len = config.CHAR_MAXLEN
        self.max_c_features = config.max_c_features
        # word-level features
        self.word_max_len = config.WORD_MAXLEN
        self.max_w_features = config.max_w_features
        self.char_mask_value = self.max_c_features - 2
        self.word_mask_value = self.max_w_features - 2
        self.batch_size = config.BATCH_SIZE

        # Embedding matrices; their widths are taken from the first row.
        self.char_embedding = config.char_init_embed
        self.word_embedding = config.word_init_embed
        self.char_embed_size = len(self.char_embedding[0])
        self.word_embed_size = len(self.word_embedding[0])
        self.n_folds = n_folds

        # Shuffled K-fold with a fixed seed so splits are reproducible.
        self.kf = KFold(n_splits=n_folds, shuffle=True, random_state=10)
        self.last_val_acc = 0.

        self.init_lr = 0.001
        self.lr_schedule = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, min_lr=0.000001, verbose=1)

        M = 3  # number of snapshots
        # The snapshot-training branches of train_predict read self.snap_epoch
        # as their epoch count; it was previously never assigned. Keep it equal
        # to the epoch count the snapshot builder is configured with.
        self.snap_epoch = NUM_EPOCHS
        self.snapshot = SnapshotCallbackBuilder(self.snap_epoch, M, self.init_lr)
        self.early_stop_monitor = EarlyStopping(patience=5)
        print("[INFO] training with {} GPUs...".format(config.n_gpus))

        self.wd = config.wd
        # Built after all attributes are set so create_model can read them.
        self.model = self.create_model()
        if config.n_gpus > 1:
            self.model = multi_gpu_model(self.model, gpus=config.n_gpus)
コード例 #3
0
    def __init__(self, maxLen, ocrLen, max_features, init_embedding_matrix,
                 name='basicModel', num_flods=4, batch_size=64):
        """
        Keep the hyper-parameters on the instance, create the K-fold splitter
        and snapshot callback builder, and finally build the model.

        :param maxLen: stored as ``self.maxLen``
        :param ocrLen: stored as ``self.ocrLen``
        :param max_features: stored as ``self.max_features``
        :param init_embedding_matrix: initial embedding matrix; the length of
            its first row is stored as ``self.embed_size``
        :param name: stored as ``self.name``
        :param num_flods: number of K-fold splits (spelling kept for callers)
        :param batch_size: stored as ``self.batch_size``
        """
        # Snapshot schedule constants.
        snapshot_count = 3      # number of snapshots
        start_lr = 5e-4         # initial learning rate

        self.name = name
        self.maxLen = maxLen
        self.ocrLen = ocrLen
        self.max_features = max_features
        self.batch_size = batch_size

        self.embedding_matrix = init_embedding_matrix
        first_row = init_embedding_matrix[0]
        self.embed_size = len(first_row)

        # Fixed seed keeps the shuffled folds reproducible.
        self.num_folds = num_flods
        self.kf = KFold(n_splits=self.num_folds, shuffle=True, random_state=10)

        self.snap_epoch = 12
        self.snapshot = SnapshotCallbackBuilder(self.snap_epoch, snapshot_count, start_lr)

        # Must come last: create_model may rely on the attributes above.
        self.model = self.create_model()
コード例 #4
0
class BasicDeepModel(BasicModel):

    """Docstring for BasicModel. """

    def __init__(self, n_folds=5, name='BasicModel', config=None):
        """
        Read hyper-parameters from ``config``, prepare the K-fold splitter and
        the training callbacks, then build the model.

        :param n_folds: number of K-fold splits
        :param name: model name, used by train_predict in checkpoint/result paths
        :param config: configuration object; the process exits when it is None.
            Attributes read here: n_class, CHAR_MAXLEN, WORD_MAXLEN,
            max_c_features, max_w_features, BATCH_SIZE, char_init_embed,
            word_init_embed, n_gpus, wd.
        """
        if config is None:
            exit('请传入数值')
        self.name = name
        self.config = config
        self.n_class = config.n_class
        # char-level features
        self.char_max_len = config.CHAR_MAXLEN
        self.max_c_features = config.max_c_features
        # word-level features
        self.word_max_len = config.WORD_MAXLEN
        self.max_w_features = config.max_w_features
        self.char_mask_value = self.max_c_features - 2
        self.word_mask_value = self.max_w_features - 2
        self.batch_size = config.BATCH_SIZE

        # Embedding matrices; their widths are taken from the first row.
        self.char_embedding = config.char_init_embed
        self.word_embedding = config.word_init_embed
        self.char_embed_size = len(self.char_embedding[0])
        self.word_embed_size = len(self.word_embedding[0])
        self.n_folds = n_folds

        # Shuffled K-fold with a fixed seed so splits are reproducible.
        self.kf = KFold(n_splits=n_folds, shuffle=True, random_state=10)
        self.last_val_acc = 0.

        self.init_lr = 0.001
        self.lr_schedule = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, min_lr=0.000001, verbose=1)

        M = 3  # number of snapshots
        # train_predict (options 1 and 2) reads self.snap_epoch as the epoch
        # count for snapshot training; it was previously never assigned, which
        # raised AttributeError. Keep it equal to the epoch count the snapshot
        # builder is configured with.
        self.snap_epoch = NUM_EPOCHS
        self.snapshot = SnapshotCallbackBuilder(self.snap_epoch, M, self.init_lr)
        self.early_stop_monitor = EarlyStopping(patience=5)
        print("[INFO] training with {} GPUs...".format(config.n_gpus))

        self.wd = config.wd
        # Built after all attributes are set so create_model can read them.
        self.model = self.create_model()
        if config.n_gpus > 1:
            self.model = multi_gpu_model(self.model, gpus=config.n_gpus)

    def poly_decay_attention(self, epoch):
        # initialize the maximum number of epochs, base learning rate,
        # and power of the polynomial

        if epoch < 5:
            print('epoch:{}, lr:{}, wd:{}'.format(1+epoch, self.init_lr, self.wd))
            return self.init_lr
        maxEpochs = NUM_EPOCHS
        baseLR = self.init_lr
        power = 1.0

        # compute the new learning rate based on polynomial decay
        alpha = baseLR * (1 - (epoch / (float(maxEpochs)))) ** power

        print('epoch:{}, lr:{}, wd:{}'.format(1+epoch, alpha, self.wd))

        # return the new learning rate
        return alpha

    def poly_decay(self, epoch):
        initial_lrate = self.init_lr
        drop = 0.5
        epochs_drop = 12
        lrate = initial_lrate * (drop ** ((1+epoch)//epochs_drop))
        print('epoch:{}, lr:{}, wd:{}'.format(1+epoch, lrate, self.wd))
        return lrate

    def plot_loss(self, H, fold):
        """
        Plot training/validation loss and accuracy for one fold and save the
        figure as ``loss/<name>-op<option>-fold<fold>.png``.

        :param H: object with a ``history`` dict (as returned by ``model.fit``)
            containing 'loss', 'val_loss' and the accuracy curves
        :param fold: fold identifier used in the file name
        """
        # grab the history object dictionary
        history = H.history

        # Older Keras reports metrics=['accuracy'] as 'acc'/'val_acc'; from
        # Keras 2.3 on the key is the name given to compile ('accuracy').
        acc_key = 'acc' if 'acc' in history else 'accuracy'

        # plot the training loss and accuracy
        epochs = np.arange(0, len(history["loss"]))
        plt.style.use("ggplot")
        plt.figure()
        plt.plot(epochs, history["loss"], label="train_loss")
        plt.plot(epochs, history["val_loss"], label="test_loss")
        plt.plot(epochs, history[acc_key], label="train_acc")
        plt.plot(epochs, history["val_" + acc_key], label="test_acc")
        plt.title("model {} option {}".format(self.name, self.config.option))
        plt.xlabel("Epoch #")
        plt.ylabel("Loss/Accuracy")
        plt.legend()

        # save the figure
        os.makedirs('loss', exist_ok=True)
        plt.savefig('loss/{}-op{}-fold{}.png'.format(self.name, self.config.option, fold))
        plt.close()

    def plot_loss_option3(self, H1, H2, fold):
        """
        Plot the loss/accuracy curves of two consecutive training phases as
        one continuous run and save the figure as
        ``loss/<name>-op<option>-fold<fold>.png``.

        :param H1: history object of the first phase (has a ``history`` dict
            whose values are lists)
        :param H2: history object of the second phase
        :param fold: fold identifier used in the file name
        """
        # grab the history object dictionaries
        first = H1.history
        second = H2.history

        # Older Keras reports metrics=['accuracy'] as 'acc'/'val_acc'; from
        # Keras 2.3 on the key is the name given to compile ('accuracy').
        acc_key = 'acc' if 'acc' in first else 'accuracy'

        # (history key, legend label) for every curve, in plotting order.
        curves = [
            ('loss', 'train_loss'),
            ('val_loss', 'test_loss'),
            (acc_key, 'train_acc'),
            ('val_' + acc_key, 'test_acc'),
        ]
        # Concatenate phase 1 and phase 2 so the x axis spans both.
        merged = {key: first[key] + second[key] for key, _ in curves}

        # plot the training loss and accuracy
        epochs = np.arange(0, len(merged['loss']))
        plt.style.use("ggplot")
        plt.figure()
        for key, label in curves:
            plt.plot(epochs, merged[key], label=label)
        plt.title("model {} option {}".format(self.name, self.config.option))
        plt.xlabel("Epoch #")
        plt.ylabel("Loss/Accuracy")
        plt.legend()

        # save the figure
        os.makedirs('loss', exist_ok=True)
        plt.savefig('loss/{}-op{}-fold{}.png'.format(self.name, self.config.option, fold))
        plt.close()


    def train_predict(self, train, train_y, test, option=3):
        """
        Train one model per K-fold split, then pickle the out-of-fold
        predictions and the fold-averaged test predictions.

        Every fold restarts from the same initial weights. After training, all
        ``.h5`` files found in the fold directory are loaded in turn and their
        predictions averaged.

        :param train: mapping of feature name -> array; the keys 'word',
            'char', 'word_left', 'word_right', 'char_left' and 'char_right'
            are sliced per fold
        :param train_y: label array, indexed per fold; rows are arg-maxed for
            scoring (presumably one-hot -- confirm against caller)
        :param test: test inputs, passed unchanged to ``self.model.predict``;
            ``test['word']`` sizes the output
        :param option: training recipe
            1 -- Adam + snapshot callbacks
            2 -- 6 plain epochs, then embeddings trainable + snapshot callbacks
            3 -- 2 epochs with frozen embeddings, then 10 epochs with trainable
                 embeddings, LR-on-plateau and best-checkpoint saving
            4 -- SGD with step decay and early stopping; weights saved at end
            5 -- Adam (clipnorm) with best-checkpoint and LR-on-plateau
            6 -- Adam + snapshot callbacks for NUM_EPOCHS
            anything else exits the process
        :return: None; results are written under ``../data/result-op<option>``
        """
        name = self.name
        model_name = '../ckpt-op{}/{}'.format(self.config.option, self.name)
        os.makedirs(model_name, exist_ok=True)

        init_weights = model_name + '/init_weight.h5'
        self.model.save_weights(init_weights)

        # __init__ configures the snapshot builder for NUM_EPOCHS but may not
        # assign snap_epoch; fall back to NUM_EPOCHS instead of raising
        # AttributeError in options 1 and 2.
        snap_epoch = getattr(self, 'snap_epoch', NUM_EPOCHS)
        fit_batch_size = self.batch_size * self.config.n_gpus
        feature_names = ['word', 'char', 'word_left', 'word_right', 'char_left', 'char_right']

        count_kflod = 0
        predict = np.zeros((len(test['word']), self.n_class))
        oof_predict = np.zeros((len(train['word']), self.n_class))
        scores_acc = []
        scores_f1 = []
        for train_index, test_index in self.kf.split(train['word']):
            model_prefix = model_name + '/' + str(count_kflod)
            os.makedirs(model_prefix, exist_ok=True)
            filepath = model_prefix + '/' + str(count_kflod) + 'model.h5'
            checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

            y_train, y_test = train_y[train_index], train_y[test_index]

            # Reset to the shared starting point before training this fold.
            self.model.load_weights(init_weights)

            kfold_X_train = {c: train[c][train_index] for c in feature_names}
            kfold_X_valid = {c: train[c][test_index] for c in feature_names}
            validation = (kfold_X_valid, y_test)

            if option == 1:
                # Snapshot training with Adam. NOTE(review): the original note
                # says the embedding is frozen here, but this branch never
                # touches `trainable`; it uses whatever create_model set.
                adam_optimizer = optimizers.Adam(lr=1e-3, clipvalue=2.0)
                self.model.compile(loss='categorical_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
                self.model.summary()
                self.model.fit(kfold_X_train, y_train,
                               batch_size=fit_batch_size,
                               epochs=snap_epoch,
                               verbose=1,
                               validation_data=validation,
                               callbacks=self.snapshot.get_callbacks(model_save_place=model_prefix))

            elif option == 2:
                # Warm-up phase first, then make the embeddings trainable and
                # continue with snapshot training.
                adam_optimizer = optimizers.Adam(lr=1e-3, clipvalue=2)
                self.model.compile(loss='categorical_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
                self.model.summary()
                self.model.fit(kfold_X_train, y_train,
                               batch_size=fit_batch_size,
                               epochs=6,
                               verbose=1,
                               validation_data=validation)

                self._set_embedding_trainable(True)
                self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
                self.model.fit(kfold_X_train, y_train,
                               batch_size=fit_batch_size,
                               epochs=snap_epoch,
                               verbose=1,
                               validation_data=validation,
                               callbacks=self.snapshot.get_callbacks(model_save_place=model_prefix))

            elif option == 3:
                # Freeze the embeddings for a short warm-up, then unfreeze
                # them and keep training.
                self._set_embedding_trainable(False)

                adam_optimizer = optimizers.Adam(lr=1e-3, clipvalue=2.4)
                self.model.compile(loss='categorical_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
                self.model.summary()
                H1 = self.model.fit(kfold_X_train, y_train,
                                    batch_size=fit_batch_size,
                                    epochs=2,
                                    verbose=1,
                                    validation_data=validation)

                self._set_embedding_trainable(True)
                print('放开embedding训练')

                callbacks = [
                    self.lr_schedule,
                    checkpoint,
                ]
                # Recompile so the changed `trainable` flags take effect.
                adam_optimizer = optimizers.Adam(lr=1e-3, clipvalue=1.5)
                self.model.compile(loss='categorical_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
                self.model.summary()

                H2 = self.model.fit(kfold_X_train, y_train,
                                    batch_size=fit_batch_size,
                                    epochs=10,
                                    verbose=1,
                                    validation_data=validation,
                                    callbacks=callbacks)

                self.plot_loss_option3(H1, H2, count_kflod)

            elif option == 4:
                # SGD with step decay and early stopping.
                if self.config.n_gpus == 1:
                    self._set_embedding_trainable(True)
                opt = optimizers.SGD(lr=self.init_lr, momentum=0.9, decay=1e-6)
                self.model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
                self.model.summary()
                callbacks = [
                    LearningRateScheduler(self.poly_decay),
                    self.early_stop_monitor,
                ]

                H = self.model.fit(kfold_X_train, y_train,
                                   batch_size=fit_batch_size,
                                   epochs=NUM_EPOCHS,
                                   verbose=1,
                                   validation_data=validation,
                                   callbacks=callbacks)
                self.plot_loss(H, count_kflod)
                # No checkpoint callback in this branch: save the final weights.
                self.model.save_weights(filepath)

            elif option == 5:
                # Adam with gradient-norm clipping (noted as the best so far).
                opt = optimizers.Adam(lr=1e-3, clipnorm=1.0)
                self.model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
                self.model.summary()
                callbacks = [
                    checkpoint,
                    self.lr_schedule,
                ]

                H = self.model.fit(kfold_X_train, y_train,
                                   batch_size=fit_batch_size,
                                   epochs=20,
                                   verbose=1,
                                   validation_data=validation,
                                   callbacks=callbacks)
                self.plot_loss(H, count_kflod)

            elif option == 6:
                # snapshot + adam
                if self.config.n_gpus == 1:
                    self._set_embedding_trainable(True)
                opt = optimizers.Adam(lr=self.init_lr, decay=1e-6)
                self.model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
                self.model.summary()
                # `callbacks` was previously undefined in this branch
                # (NameError). The snapshot builder is configured with
                # NUM_EPOCHS and init_lr, matching this fit call, and is the
                # only thing that writes the .h5 files evaluated below.
                callbacks = self.snapshot.get_callbacks(model_save_place=model_prefix)
                H = self.model.fit(kfold_X_train, y_train,
                                   batch_size=fit_batch_size,
                                   epochs=NUM_EPOCHS,
                                   verbose=1,
                                   validation_data=validation,
                                   callbacks=callbacks)

                self.plot_loss(H, count_kflod)

            else:
                exit('Wrong option')

            # Every weight file saved for this fold takes part in the average.
            evaluations = [fname for fname in os.listdir(model_prefix) if '.h5' in fname]
            print(evaluations)

            preds1 = np.zeros((test['word'].shape[0], self.n_class))
            preds2 = np.zeros((len(kfold_X_valid['word']), self.n_class))
            for weight_file in evaluations:
                self.model.load_weights(os.path.join(model_prefix, weight_file))
                preds1 += self.model.predict(test, verbose=1) / len(evaluations)
                preds2 += self.model.predict(kfold_X_valid, batch_size=64*self.config.n_gpus) / len(evaluations)

            predict += preds1 / self.n_folds
            oof_predict[test_index] = preds2

            true_classes = np.argmax(y_test, axis=1)
            accuracy = self.cal_acc(oof_predict[test_index], true_classes)
            f1 = self.cal_f_alpha(oof_predict[test_index], true_classes, n_out=self.n_class)

            print('the kflod cv acc is : ', str(accuracy))
            print('the kflod cv f1 is : ', str(f1))
            count_kflod += 1
            scores_acc.append(accuracy)
            scores_f1.append(f1)

        mean_acc = np.mean(scores_acc)
        mean_f1 = np.mean(scores_f1)
        print('total acc scores is ', mean_acc)
        print('total f1 scores is ', mean_f1)

        result_dir = '../data/result-op{}'.format(self.config.option)
        os.makedirs(result_dir, exist_ok=True)
        with open('{}/{}_oof_f1_{}_a{}.pkl'.format(result_dir, name, str(mean_f1), str(mean_acc)), 'wb') as f:
            pickle.dump(oof_predict, f)

        with open('{}/{}_pre_f1_{}_a{}.pkl'.format(result_dir, name, str(mean_f1), str(mean_acc)), 'wb') as f:
            pickle.dump(predict, f)

        print('done')

    def _set_embedding_trainable(self, trainable):
        """Set ``trainable`` on the embedding layer(s) chosen by ``config.main_feature``; exit on an unknown feature."""
        feature = self.config.main_feature
        if feature == 'all':
            layer_names = ('char_embedding', 'word_embedding')
        elif feature == 'word':
            layer_names = ('word_embedding',)
        elif feature == 'char':
            layer_names = ('char_embedding',)
        else:
            exit('Wrong feature')
        for layer_name in layer_names:
            self.model.get_layer(layer_name).trainable = trainable

    def rerun(self, test):
        name = self.name
        evaluations = []
        for i in range(4):
            evaluations.append('../ckpt/{}/{}/{}model.h5'.format(name, i, i))

        predict = np.zeros((len(test), self.n_class))
        preds1 = np.zeros((test.shape[0], self.n_class))

        for run, i in enumerate(evaluations):
            self.model.load_weights(i)
            preds1 += self.model.predict(test, verbose=1) / len(evaluations)

        predict += preds1 / 4

        with open('../data/result/' + name + '_pre_.pkl', 'wb') as f:
            pickle.dump(predict, f)
コード例 #5
0
def stacking_pseudo(train, train_y, test, results):
    """
    Retrain with pseudo-labelled test data and return new test predictions.

    The arg-max class of each row of ``results`` becomes a one-hot label for
    the matching test sample; test samples are appended to the training set
    and a fresh model is trained per fold (6 folds, snapshot callbacks).
    Validation during fitting uses only fold samples with index < 48480
    (presumably the original labelled rows -- confirm against the data set).

    :param train: mapping with a 'news' array; not modified
    :param train_y: 2-D label array aligned with ``train['news']``
    :param test: mapping with a 'news' array; passed unchanged to ``predict``
    :param results: 2-D array of test predictions used to derive pseudo-labels
    :return: array of shape (n_test, 3), averaged over folds and snapshots
    """
    # Hard pseudo-labels. num_classes is pinned to train_y's width so the
    # one-hot matrix can be concatenated even if the highest class is never
    # predicted (to_categorical would otherwise infer too few columns).
    pseudo_labels = np.argmax(results, axis=1)
    answer = np_utils.to_categorical(pseudo_labels, num_classes=train_y.shape[1])
    train_y = np.concatenate([train_y, answer], axis=0)

    # Work on a shallow copy so the caller's dict is not extended in place.
    train = dict(train)
    train['news'] = np.concatenate([train['news'], test['news']], axis=0)

    savepath = './pesudo_/'
    os.makedirs(savepath, exist_ok=True)
    count_kflod = 0
    num_folds = 6
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=10)
    predict = np.zeros((test['news'].shape[0], 3))
    oof_predict = np.zeros((train['news'].shape[0], 3))
    scores = []
    for train_index, test_index in kf.split(train['news']):
        y_train, y_test = train_y[train_index], train_y[test_index]

        kfold_X_train = {'news': train['news'][train_index]}
        kfold_X_valid = {'news': train['news'][test_index]}

        # Monitor validation only on rows below the hard-coded cut-off.
        # The previous code indexed the dict itself (`train[i]`), which raises
        # KeyError; the rows live under train['news'].
        watched_index = test_index[test_index < 48480]
        test_watch = train['news'][watched_index]
        test_label = train_y[watched_index]

        model_prefix = savepath + 'DNN' + str(count_kflod)
        os.makedirs(model_prefix, exist_ok=True)

        M = 4  # number of snapshots
        alpha_zero = 1e-3  # initial learning rate
        snap_epoch = 16
        snapshot = SnapshotCallbackBuilder(snap_epoch, M, alpha_zero)

        # A fresh model per fold.
        res_model = get_model(train['news'])
        res_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        res_model.fit(kfold_X_train, y_train, batch_size=BATCH_SIZE, epochs=snap_epoch, verbose=1,
                      validation_data=(test_watch, test_label),
                      callbacks=snapshot.get_callbacks(model_save_place=model_prefix))

        # Every weight file saved for this fold takes part in the average.
        evaluations = [fname for fname in os.listdir(model_prefix) if '.h5' in fname]
        print(evaluations)

        preds1 = np.zeros((test['news'].shape[0], 3))
        preds2 = np.zeros((len(kfold_X_valid['news']), 3))
        for weight_file in evaluations:
            res_model.load_weights(os.path.join(model_prefix, weight_file))
            preds1 += res_model.predict(test, verbose=1) / len(evaluations)
            preds2 += res_model.predict(kfold_X_valid, batch_size=128) / len(evaluations)

        predict += preds1 / num_folds
        oof_predict[test_index] = preds2

        accuracy = check_accuracy(oof_predict[test_index], y_test, test_index)
        print('the kflod cv is : ', str(accuracy))
        count_kflod += 1
        scores.append(accuracy)
    print('total scores is ', np.mean(scores))
    return predict
コード例 #6
0
class BasicModel:
    '''
    basic class of all models
    '''

    def __init__(self, maxLen, ocrLen, max_features, init_embedding_matrix,
                 name='basicModel', num_flods=4, batch_size=64):
        """
        Keep the hyper-parameters on the instance, create the K-fold splitter
        and snapshot callback builder, and finally build the model.

        :param maxLen: stored as ``self.maxLen``
        :param ocrLen: stored as ``self.ocrLen``
        :param max_features: stored as ``self.max_features``
        :param init_embedding_matrix: initial embedding matrix; the length of
            its first row is stored as ``self.embed_size``
        :param name: stored as ``self.name``
        :param num_flods: number of K-fold splits (spelling kept for callers)
        :param batch_size: stored as ``self.batch_size``
        """
        # Snapshot schedule constants.
        snapshot_count = 3      # number of snapshots
        start_lr = 5e-4         # initial learning rate

        self.name = name
        self.maxLen = maxLen
        self.ocrLen = ocrLen
        self.max_features = max_features
        self.batch_size = batch_size

        self.embedding_matrix = init_embedding_matrix
        first_row = init_embedding_matrix[0]
        self.embed_size = len(first_row)

        # Fixed seed keeps the shuffled folds reproducible.
        self.num_folds = num_flods
        self.kf = KFold(n_splits=self.num_folds, shuffle=True, random_state=10)

        self.snap_epoch = 12
        self.snapshot = SnapshotCallbackBuilder(self.snap_epoch, snapshot_count, start_lr)

        # Must come last: create_model may rely on the attributes above.
        self.model = self.create_model()

    def create_model(self):
        """
        Placeholder model factory; this base version builds nothing.

        :return: None
        """
        return None



    def train_predict(self, train, train_y, test, option=3, true_length=48480):
        """
        Train one model per K-fold split, then pickle the out-of-fold
        predictions and the fold-averaged test predictions.

        Every fold restarts from the same initial weights. After training, all
        ``.h5`` files found in the fold directory are loaded in turn and their
        predictions averaged.

        :param train: mapping with 'news' and 'ocr' arrays, sliced per fold
        :param train_y: label array, sliced per fold
        :param test: test inputs, passed unchanged to ``self.model.predict``;
            ``test['news']`` sizes the output
        :param option: training recipe
            1 -- frozen embedding + snapshot callbacks
            2 -- 4 epochs with frozen embedding, then trainable embedding +
                 snapshot callbacks
            other -- 6 epochs with frozen embedding, then 5 epochs with
                 trainable embedding at a lower learning rate; final weights
                 saved once
        :param true_length: validation rows whose original index is at or
            above this value are ignored by ``check_accuracy``
        :return: None; results are written under ``../data/result/``
        """
        name = self.name
        model_name = '../ckpt/' + name
        result_dir = '../data/result/'
        # Create both output locations up front (including missing parents) so
        # a missing directory cannot fail the run after all folds have trained.
        os.makedirs(model_name, exist_ok=True)
        os.makedirs(result_dir, exist_ok=True)

        init_weights = model_name + '/init_weight.h5'
        self.model.save_weights(init_weights)

        count_kflod = 0
        predict = np.zeros((test['news'].shape[0], 3))
        oof_predict = np.zeros((train['news'].shape[0], 3))
        scores = []
        for train_index, test_index in self.kf.split(train['news']):
            model_prefix = model_name + '/' + str(count_kflod)
            os.makedirs(model_prefix, exist_ok=True)

            y_train, y_test = train_y[train_index], train_y[test_index]

            # Reset to the shared starting point before training this fold.
            self.model.load_weights(init_weights)

            kfold_X_train = {c: train[c][train_index] for c in ['news', 'ocr']}
            kfold_X_valid = {c: train[c][test_index] for c in ['news', 'ocr']}
            validation = (kfold_X_valid, y_test)

            if option == 1:
                # Freeze the embedding and train with snapshot callbacks.
                self.model.get_layer('embedding').trainable = False
                adam_optimizer = optimizers.Adam(lr=1e-3, clipvalue=2.0)
                self.model.compile(loss='categorical_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
                self.model.summary()
                self.model.fit(kfold_X_train, y_train, batch_size=self.batch_size, epochs=self.snap_epoch, verbose=1,
                               validation_data=validation,
                               callbacks=self.snapshot.get_callbacks(model_save_place=model_prefix))

            elif option == 2:
                # Warm up with a frozen embedding, then unfreeze it and
                # continue with snapshot callbacks.
                self.model.get_layer('embedding').trainable = False
                adam_optimizer = optimizers.Adam(lr=1e-3, clipvalue=2)
                self.model.compile(loss='categorical_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
                self.model.summary()
                self.model.fit(kfold_X_train, y_train, batch_size=self.batch_size, epochs=4, verbose=1,
                               validation_data=validation)

                self.model.get_layer('embedding').trainable = True
                self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
                self.model.fit(kfold_X_train, y_train, batch_size=self.batch_size, epochs=self.snap_epoch, verbose=1,
                               validation_data=validation,
                               callbacks=self.snapshot.get_callbacks(model_save_place=model_prefix))

            else:
                # Warm up with a frozen embedding, then unfreeze it and keep
                # training at a lower learning rate.
                self.model.get_layer('embedding').trainable = False
                adam_optimizer = optimizers.Adam(lr=1e-3, clipvalue=2.4)
                self.model.compile(loss='categorical_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
                self.model.summary()
                self.model.fit(kfold_X_train, y_train, batch_size=self.batch_size, epochs=6, verbose=1,
                               validation_data=validation)

                adam_optimizer = optimizers.Adam(lr=1e-4, clipvalue=1.5)
                self.model.get_layer('embedding').trainable = True
                self.model.compile(loss='categorical_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
                self.model.fit(kfold_X_train, y_train, batch_size=self.batch_size, epochs=5, verbose=1,
                               validation_data=validation)

                # No checkpoint callback in this branch: save the final weights.
                self.model.save_weights(model_prefix + '/' + str(count_kflod) + 'model.h5')

            # Every weight file saved for this fold takes part in the average.
            evaluations = [fname for fname in os.listdir(model_prefix) if '.h5' in fname]
            print(evaluations)

            preds1 = np.zeros((test['news'].shape[0], 3))
            preds2 = np.zeros((len(kfold_X_valid['news']), 3))
            for weight_file in evaluations:
                self.model.load_weights(os.path.join(model_prefix, weight_file))
                preds1 += self.model.predict(test, verbose=1) / len(evaluations)
                preds2 += self.model.predict(kfold_X_valid, batch_size=128) / len(evaluations)

            predict += preds1 / self.num_folds
            oof_predict[test_index] = preds2

            accuracy = self.check_accuracy(oof_predict[test_index], y_test, test_index, true_length)
            print('the kflod cv is : ', str(accuracy))
            count_kflod += 1
            scores.append(accuracy)

        mean_score = np.mean(scores)
        print('total scores is ', mean_score)

        # The files are pickles despite the .txt extension.
        with open(result_dir + name + '_oof_' + str(mean_score) + '.txt', 'wb') as f:
            pickle.dump(oof_predict, f)

        with open(result_dir + name + '_pre_' + str(mean_score) + '.txt', 'wb') as f:
            pickle.dump(predict, f)

        print('done')

    def check_accuracy(self, pred, label, test_index, true_length):
        """
        Accuracy over the rows whose original index is below ``true_length``.

        Rows with ``test_index[i] >= true_length`` are skipped. A row counts
        as correct when the arg-max of the prediction equals the arg-max of
        the label.

        :param pred: sequence of per-sample score vectors
        :param label: sequence of per-sample label vectors, aligned with pred
        :param test_index: original index of each row, aligned with pred
        :param true_length: exclusive upper bound on indices that are scored
        :return: fraction of scored rows predicted correctly; 0.0 when no row
            is scored (previously this raised ZeroDivisionError)
        """
        right = 0
        total = 0
        for scores, truth, original_index in zip(pred, label, test_index):
            if original_index >= true_length:
                continue
            total += 1
            if int(np.argmax(scores)) == int(np.argmax(truth)):
                right += 1
        if total == 0:
            # Nothing to score in this fold; avoid dividing by zero.
            return 0.0
        return right / total
コード例 #7
0
class BasicModel:
    """Base class of all models: K-fold training, weight ensembling and prediction.

    ``create_model`` is a no-op here (it returns ``None``); subclasses are
    expected to override it so that ``self.model`` is a usable model object.
    """

    def __init__(self,
                 maxLen,
                 ocrLen,
                 max_features,
                 init_embedding_matrix,
                 name='basicModel',
                 num_flods=4,
                 batch_size=64):
        """Store the hyper-parameters and build the splitter, snapshot builder and model.

        :param maxLen: stored as ``self.maxLen`` (presumably the news sequence
            length -- confirm in the subclass that uses it).
        :param ocrLen: stored as ``self.ocrLen`` (presumably the OCR sequence
            length -- confirm in the subclass that uses it).
        :param max_features: stored as ``self.max_features``.
        :param init_embedding_matrix: indexable matrix; the length of its first
            row becomes ``self.embed_size``.
        :param name: model name, used to build checkpoint and result paths.
        :param num_flods: number of K-fold splits (parameter spelling kept for
            backward compatibility).
        :param batch_size: batch size used for every ``fit`` call.
        """
        self.name = name
        self.ocrLen = ocrLen
        self.batch_size = batch_size
        self.maxLen = maxLen
        self.max_features = max_features
        self.embedding_matrix = init_embedding_matrix
        self.embed_size = len(init_embedding_matrix[0])

        self.num_folds = num_flods
        self.kf = KFold(n_splits=self.num_folds, shuffle=True, random_state=10)

        M = 3  # number of snapshots
        alpha_zero = 5e-4  # initial learning rate
        self.snap_epoch = 12
        self.snapshot = SnapshotCallbackBuilder(self.snap_epoch, M, alpha_zero)

        self.model = self.create_model()

    def create_model(self):
        """Build and return the model; the base implementation returns ``None``."""
        pass

    def _compile_model(self, optimizer, embedding_trainable):
        """Set the 'embedding' layer's trainable flag, then (re)compile the model."""
        self.model.get_layer('embedding').trainable = embedding_trainable
        self.model.compile(loss='categorical_crossentropy',
                           optimizer=optimizer,
                           metrics=['accuracy'])

    def _fit_model(self, x_train, y_train, x_valid, y_valid, epochs,
                   callbacks=None):
        """Fit the model for ``epochs`` epochs, validating on the held-out fold."""
        fit_kwargs = {
            'batch_size': self.batch_size,
            'epochs': epochs,
            'verbose': 1,
            'validation_data': (x_valid, y_valid),
        }
        # Only forward ``callbacks`` when given, so plain fits are called
        # exactly as before.
        if callbacks is not None:
            fit_kwargs['callbacks'] = callbacks
        self.model.fit(x_train, y_train, **fit_kwargs)

    def train_predict(self, train, train_y, test, option=3, true_length=48480):
        """Train with K-fold cross-validation, then pickle OOF and test predictions.

        For every fold the model is reset to its initial weights, trained
        according to ``option``, and every weight file found in the fold's
        checkpoint directory is loaded in turn; their predictions are averaged.

        :param train: mapping with ``'news'`` and ``'ocr'`` entries, each
            indexable by an array of row indices.
        :param train_y: targets, indexable by an array of row indices.
        :param test: model input for the test set; ``test['news'].shape[0]``
            gives the number of test rows.
        :param option: training recipe.
            ``1``: frozen embedding, snapshot callbacks.
            ``2``: frozen embedding warm-up, then trainable embedding with
            snapshot callbacks.
            anything else: frozen embedding warm-up, then trainable embedding
            at a lower learning rate; the final weights are saved explicitly.
        :param true_length: rows with an index at or beyond this value are
            excluded from the accuracy score (see ``check_accuracy``).
        :return: ``None``; results are written under ``../data/result/``.
        """
        name = self.name
        model_name = '../ckpt/' + name
        # makedirs(exist_ok=True) also creates a missing '../ckpt' parent and
        # avoids the check-then-create race of exists() + mkdir().
        os.makedirs(model_name, exist_ok=True)
        init_weight_path = model_name + '/init_weight.h5'
        self.model.save_weights(init_weight_path)

        predict = np.zeros((test['news'].shape[0], 3))
        oof_predict = np.zeros((train['news'].shape[0], 3))
        scores = []
        for count_kflod, (train_index, test_index) in enumerate(
                self.kf.split(train['news'])):
            model_prefix = model_name + '/' + str(count_kflod)
            os.makedirs(model_prefix, exist_ok=True)

            y_train, y_test = train_y[train_index], train_y[test_index]

            # Every fold starts from the same initial weights.
            self.model.load_weights(init_weight_path)

            kfold_X_train = {c: train[c][train_index] for c in ('news', 'ocr')}
            kfold_X_valid = {c: train[c][test_index] for c in ('news', 'ocr')}

            if option == 1:
                # Freeze the embedding and train with the snapshot callbacks.
                self._compile_model(
                    optimizers.Adam(lr=1e-3, clipvalue=2.0), False)
                self.model.summary()
                self._fit_model(
                    kfold_X_train, y_train, kfold_X_valid, y_test,
                    epochs=self.snap_epoch,
                    callbacks=self.snapshot.get_callbacks(
                        model_save_place=model_prefix))

            elif option == 2:
                # Warm up with a frozen embedding, then unfreeze it and train
                # with the snapshot callbacks.
                self._compile_model(
                    optimizers.Adam(lr=1e-3, clipvalue=2), False)
                self.model.summary()
                self._fit_model(kfold_X_train, y_train, kfold_X_valid, y_test,
                                epochs=4)

                self._compile_model('adam', True)
                self._fit_model(
                    kfold_X_train, y_train, kfold_X_valid, y_test,
                    epochs=self.snap_epoch,
                    callbacks=self.snapshot.get_callbacks(
                        model_save_place=model_prefix))

            else:
                # Warm up with a frozen embedding, then unfreeze it and keep
                # training at a lower learning rate.
                self._compile_model(
                    optimizers.Adam(lr=1e-3, clipvalue=2.4), False)
                self.model.summary()
                self._fit_model(kfold_X_train, y_train, kfold_X_valid, y_test,
                                epochs=6)

                self._compile_model(
                    optimizers.Adam(lr=1e-4, clipvalue=1.5), True)
                self._fit_model(kfold_X_train, y_train, kfold_X_valid, y_test,
                                epochs=5)

                # No snapshot callbacks in this branch, so save the weights
                # here for the ensembling step below to pick up.
                self.model.save_weights(model_prefix + '/' + str(count_kflod) +
                                        'model.h5')

            # NOTE: every file in the fold directory whose name contains '.h5'
            # is used, including weights left over from an earlier run.
            evaluations = [
                entry for entry in os.listdir(model_prefix) if '.h5' in entry
            ]
            print(evaluations)

            preds1 = np.zeros((test['news'].shape[0], 3))
            preds2 = np.zeros((len(kfold_X_valid['news']), 3))
            for weight_file in evaluations:
                self.model.load_weights(os.path.join(model_prefix, weight_file))
                preds1 += self.model.predict(test,
                                             verbose=1) / len(evaluations)
                preds2 += self.model.predict(kfold_X_valid,
                                             batch_size=128) / len(evaluations)

            predict += preds1 / self.num_folds
            oof_predict[test_index] = preds2

            accuracy = self.check_accuracy(oof_predict[test_index], y_test,
                                           test_index, true_length)
            print('the kflod cv is : ', str(accuracy))
            scores.append(accuracy)

        mean_score = np.mean(scores)
        print('total scores is ', mean_score)

        with open(
                '../data/result/' + name + '_oof_' + str(mean_score) +
                '.txt', 'wb') as f:
            pickle.dump(oof_predict, f)

        with open(
                '../data/result/' + name + '_pre_' + str(mean_score) +
                '.txt', 'wb') as f:
            pickle.dump(predict, f)

        print('done')

    def check_accuracy(self, pred, label, test_index, true_length):
        """Return the arg-max accuracy of ``pred`` against ``label``.

        Only rows whose original sample index is below ``true_length`` are
        scored; rows at or beyond that index are skipped.

        :param pred: per-row class scores; row ``k`` is compared with ``label[k]``.
        :param label: per-row target scores (arg-max gives the true class).
        :param test_index: original sample index of each row of ``pred``.
        :param true_length: rows with ``test_index[k] >= true_length`` are ignored.
        :return: fraction of scored rows predicted correctly, or ``0.0`` when no
            row qualifies (the unguarded division would raise ZeroDivisionError).
        """
        right = 0
        total = 0
        for count, row_scores in enumerate(pred):
            if test_index[count] >= true_length:
                continue
            total += 1
            if int(np.argmax(row_scores)) == int(np.argmax(label[count])):
                right += 1
        # Guard the empty case: a fold may contain no scorable rows at all.
        if total == 0:
            return 0.0
        return right / total
コード例 #8
0
def stacking_pseudo(train, train_y, test, results):
    """Retrain on train + pseudo-labelled test data with K-fold weight ensembling.

    The arg-max of ``results`` is one-hot encoded and appended to ``train_y``
    as labels for ``test``, which is appended to ``train``. A fresh model is
    trained per fold with snapshot callbacks; every weight file found in the
    fold directory is loaded and the predictions are averaged.

    :param train: training inputs (array; concatenated with ``test`` on axis 0).
    :param train_y: one-hot training targets.
    :param test: test inputs; also the data that is finally predicted.
    :param results: earlier class scores for ``test``; arg-max over axis 1
        gives the pseudo label.
    :return: test predictions of shape ``(test.shape[0], config.n_class)``,
        averaged over weight files and folds.
    """
    # Hard pseudo labels for the test rows.
    answer = np.argmax(results, axis=1)
    answer = np_utils.to_categorical(answer, num_classes=config.n_class)

    train_y = np.concatenate([train_y, answer], axis=0)
    train = np.concatenate([train, test], axis=0)

    savepath = './pesudo_{}/'.format(args.option)
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(savepath, exist_ok=True)

    num_folds = 6
    M = 4  # number of snapshots
    alpha_zero = 1e-3  # initial learning rate
    snap_epoch = 16

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=10)
    predict = np.zeros((test.shape[0], config.n_class))
    oof_predict = np.zeros((train.shape[0], config.n_class))
    scores = []
    f1s = []
    for count_kflod, (train_index, test_index) in enumerate(kf.split(train)):
        y_train, y_test = train_y[train_index], train_y[test_index]
        kfold_X_train, kfold_X_valid = train[train_index], train[test_index]

        model_prefix = savepath + 'DNN' + str(count_kflod)
        os.makedirs(model_prefix, exist_ok=True)

        snapshot = SnapshotCallbackBuilder(snap_epoch, M, alpha_zero)

        res_model = get_model(train)
        res_model.compile(loss='categorical_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])

        res_model.fit(
            kfold_X_train,
            y_train,
            batch_size=BATCH_SIZE,
            epochs=snap_epoch,
            verbose=1,
            validation_data=(kfold_X_valid, y_test),
            callbacks=snapshot.get_callbacks(model_save_place=model_prefix))

        # NOTE: every file in the fold directory whose name contains '.h5' is
        # used, including weights left over from an earlier run.
        evaluations = [
            entry for entry in os.listdir(model_prefix) if '.h5' in entry
        ]
        print(evaluations)

        preds1 = np.zeros((test.shape[0], config.n_class))
        preds2 = np.zeros((len(kfold_X_valid), config.n_class))
        for weight_file in evaluations:
            res_model.load_weights(os.path.join(model_prefix, weight_file))
            preds1 += res_model.predict(test, verbose=1) / len(evaluations)
            preds2 += res_model.predict(kfold_X_valid,
                                        batch_size=128) / len(evaluations)

        predict += preds1 / num_folds
        oof_predict[test_index] = preds2

        true_classes = np.argmax(y_test, axis=1)
        accuracy = mb.cal_acc(oof_predict[test_index], true_classes)
        f1 = mb.cal_f_alpha(oof_predict[test_index],
                            true_classes,
                            n_out=config.n_class)
        print('the kflod cv is : ', str(accuracy))
        print('the kflod f1 is : ', str(f1))
        scores.append(accuracy)
        f1s.append(f1)
    print('total scores is ', np.mean(scores))
    print('total f1 is ', np.mean(f1s))
    return predict
コード例 #9
0
def stacking_pseudo(train, train_y, test, results):
    """Retrain on train + pseudo-labelled test data with K-fold weight ensembling.

    The arg-max of ``results`` over its last axis is one-hot encoded into
    4 classes and reshaped to ``(-1, 10, 4)``; these pseudo labels are
    appended to ``train_y`` and ``test`` is appended to ``train``. A fresh
    model is trained per fold with snapshot callbacks; every weight file
    found in the fold directory is loaded and the predictions are averaged.

    :param train: training inputs (array; concatenated with ``test`` on axis 0).
    :param train_y: one-hot training targets, concatenated with the
        ``(n, 10, 4)`` pseudo labels.
    :param test: test inputs; also the data that is finally predicted.
    :param results: earlier class scores for ``test``.
    :return: tuple ``(predict, mean_f1)``: test predictions of shape
        ``(test.shape[0], 10, 4)`` averaged over weight files and folds, and
        the mean of the per-fold validation F1 scores.
    """
    # Hard pseudo labels for the test rows.
    answer = np.reshape(np.argmax(results, axis=-1), [-1])
    answer = np.reshape(np.eye(4)[answer], [-1, 10, 4])

    train_y = np.concatenate([train_y, answer], axis=0)
    train = np.concatenate([train, test], axis=0)

    savepath = './pesudo_{}_dt{}/'.format(args.option, args.data_type)
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(savepath, exist_ok=True)

    num_folds = 5
    M = 3  # number of snapshots
    alpha_zero = 1e-3  # initial learning rate
    snap_epoch = 30

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=10)
    predict = np.zeros((test.shape[0], 10, 4))
    scores = []

    # ``fold`` must not be reused by the inner loops: the original reused ``i``
    # for weight file names, so the per-fold F1 line printed a file name
    # instead of the fold number.
    for fold, (train_index, test_index) in enumerate(kf.split(train)):
        print('第{}折'.format(fold))

        y_train, y_test = train_y[train_index], train_y[test_index]
        kfold_X_train, kfold_X_valid = train[train_index], train[test_index]

        model_prefix = savepath + 'DNN' + str(fold)
        os.makedirs(model_prefix, exist_ok=True)

        snapshot = SnapshotCallbackBuilder(snap_epoch, M, alpha_zero)

        res_model = get_model(train)
        res_model.compile(loss='categorical_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])
        res_model.summary()

        res_model.fit(
            kfold_X_train,
            y_train,
            batch_size=BATCH_SIZE,
            epochs=snap_epoch,
            verbose=1,
            validation_data=(kfold_X_valid, y_test),
            callbacks=snapshot.get_callbacks(model_save_place=model_prefix))

        # NOTE: every file in the fold directory whose name contains '.h5' is
        # used, including weights left over from an earlier run.
        evaluations = [
            entry for entry in os.listdir(model_prefix) if '.h5' in entry
        ]

        test_pred_ = np.zeros((test.shape[0], 10, 4))
        oof_pred_ = np.zeros((len(kfold_X_valid), 10, 4))
        for weight_file in evaluations:
            weight_path = os.path.join(model_prefix, weight_file)
            print('loading from {}'.format(weight_path))
            res_model.load_weights(weight_path)
            test_pred_ += res_model.predict(test, verbose=1,
                                            batch_size=256) / len(evaluations)
            oof_pred_ += res_model.predict(kfold_X_valid,
                                           batch_size=256) / len(evaluations)

        predict += test_pred_ / num_folds

        f1 = get_f1_score(np.argmax(oof_pred_, -1),
                          np.argmax(y_test, -1),
                          verbose=True)
        print(fold, ' kflod cv f1 : ', str(f1))
        scores.append(f1)
    print('f1 {} -> {}'.format(scores, np.mean(scores)))
    return predict, np.mean(scores)