Example #1
    # print(result_file_path)


    w2v_embedding_cnn = WordEmbeddingCNN(
        rand_seed=seed,
        verbose=1,
        optimizers=config['optimizers'],
        input_dim=init_weight.shape[0],
        word_embedding_dim=word_embedding_dim,
        input_length=sentence_padding_length,
        num_labels=len(label_to_index),
        l1_conv_filter_type=[
            [layer1, 2, 10, 'valid', [2, 1], 0.],
            [layer1, 4, 10, 'valid', [2, 1], 0.],
            [layer1, 6, 10, 'valid', [2, 1], 0.],
        ],
        l2_conv_filter_type=[
            [layer2, 3, 1, 'valid', [2, 1], 0.25],
        ],
        full_connected_layer_units=[[hidden1, 0.5], [hidden2, 0.5]],
        embedding_dropout_rate=0.,
        nb_epoch=30,
        earlyStoping_patience=config['earlyStoping_patience'],
        lr=config['lr'],
        batch_size=batch_size,
        embedding_weight_trainable=True,
        embedding_init_weight=init_weight,
    )
    print(w2v_embedding_cnn.embedding_layer_output.get_weights()[0][1])

    w2v_embedding_cnn.print_model_descibe()
    def cross_validation(
            train_data=None,
            test_data=None,
            cv_data=None,
            feature_type='word',
            input_length=None,
            num_filter_list=None,
            verbose=0,
            cv=3,
            batch_size=32,
            lr=1e-2,
            need_segmented=True,
            word2vec_model_file_path=None,
            num_labels=24,
            embedding_weight_trainable=False,
            # fetch the middle-layer output
            get_cnn_middle_layer_output=False,
            middle_layer_output_file=None,
            rand_weight=False,
            need_validation=True,
            include_train_data=True,
            vocabulary_including_test_set=True,
    ):
        """

        Parameters
        ----------
        train_data : array-like
            训练数据 (train_X, train_y))
        test_data : array-like
            测试数据
        cv_data : array-like
            k份验证数据
        input_length : int
            输入长度
        num_filter_list : array-like
            验证参数,number of filters
        middle_layer_output_file : str
            中间层输出到哪个文件
        get_cnn_middle_layer_output : bool
            是否获取中间层输出(#,False)
        num_labels: int
            标签
        batch_size : int
            batch size
        vocabulary_including_test_set: bool,default,True
            字典是否包括测试集
        include_train_data : bool
            是否包含训练数据一样验证
        need_validation: bool
            是否要验证
        embedding_weight_trainable : bool
            切换 CNN(static-w2v) 和 CNN(non-static-w2v)
        rand_weight : bool
            切换 CNN(rand) or CNN(static/non-static-w2v)
        feature_type : str
            特征类型
        verbose : int
            数值越大,输出越详细
        cv:int
            进行 cv 折验证
        need_segmented:bool
            是否需要分词
        word2vec_model_file_path

        Notes
        ----------
        - 为了提高效率,默认设置 update_dictionary = False ,以保证feature encoder的字典一致,避免重复构造字典
        - 同时设置 diff_train_val_feature_encoder=1 来保证训练集上和验证集上的feature encoder 不同,因为字典大小不同

        Examples
        ----------
        >>> train_x = ['你好', '测试句子', '我要买手机', '今天天气不错', '无聊']
        >>> train_y = [1, 2, 3, 2, 3]
        >>> test_x = ['你好', '不错哟']
        >>> test_y = [1, 2]
        >>> cv_x = [['你好', '无聊'], ['测试句子', '今天天气不错'], ['我要买手机']]
        >>> cv_y = [[1, 3], [2, 2], [3]]
        >>> WordEmbeddingCNNWithOneConv.cross_validation(
        >>>         train_data = (train_x,train_y),
        >>>         test_data=(test_x,test_y),
        >>>         input_length=8,
        >>>         num_filter_list=[5,50],
        >>>         verbose=1,
        >>>         word2vec_model_file_path = '/home/jdwang/PycharmProjects/nlp_util/data_processing_util/word2vec_util/vector/50dim/vector1000000_50dim.gem',
        >>>     )

        """
        print('=' * 80)
        print('feature_type: %s, need_segmented: %s, vocabulary_including_test_set: %s'
              % (feature_type, need_segmented, vocabulary_including_test_set))
        print('input_length: %d, num_labels: %d' % (input_length, num_labels))
        print('lr: %f, batch_size: %d, rand_weight: %s, embedding_weight_trainable: %s'
              % (lr, batch_size, rand_weight, embedding_weight_trainable))
        if not rand_weight:
            print('W2V model file_path: %s' % word2vec_model_file_path)
        print('=' * 80)

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

        # 1. Get the cross-validation data
        if cv_data is None:
            assert train_data is not None, 'At least one of cv_data and train_data must be provided!'
            cv_data = get_k_fold_data(
                k=cv,
                train_data=train_data,
                test_data=test_data,
                include_train_data=include_train_data,
            )

        # 2. Feature-encode the data
        feature_encoder = WordEmbeddingCNN.get_feature_encoder(
            need_segmented=need_segmented,
            input_length=input_length,
            verbose=1,
            feature_type=feature_type,
            padding_mode='center',
            # keep the dictionary consistent
            update_dictionary=False,
            vocabulary_including_test_set=vocabulary_including_test_set,
        )

        cv_data = transform_cv_data(feature_encoder, cv_data, verbose=verbose, diff_train_val_feature_encoder=1)

        # cross-validation
        for num_filter in num_filter_list:
            print('=' * 40)
            print('num_filter is %d.' % num_filter)
            _, _, middle_output_dev, middle_output_val = get_val_score(
                WordEmbeddingCNNWithOneConv,
                cv_data=cv_data[:],
                verbose=verbose,
                num_filter=num_filter,
                num_labels=num_labels,
                word2vec_model_file_path=word2vec_model_file_path,
                embedding_weight_trainable=embedding_weight_trainable,
                get_cnn_middle_layer_output=get_cnn_middle_layer_output,
                need_validation=need_validation,
                rand_weight=rand_weight,
                batch_size=batch_size,
                lr=lr,
            )

            if get_cnn_middle_layer_output:
                # save the results; pickle requires a binary file handle
                with open(middle_layer_output_file, 'wb') as fout:
                    # dump the intermediate outputs in order
                    pickle.dump(cv_data, fout)
                    pickle.dump(middle_output_dev, fout)
                    pickle.dump(middle_output_val, fout)
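
    # A read-back sketch for the dump above; the file name is a hypothetical
    # placeholder. pickle.load returns the objects in the order they were
    # dumped.
    def load_middle_layer_output(middle_layer_output_file='middle_output.pkl'):
        import pickle
        with open(middle_layer_output_file, 'rb') as fin:
            cv_data = pickle.load(fin)
            middle_output_dev = pickle.load(fin)
            middle_output_val = pickle.load(fin)
        return cv_data, middle_output_dev, middle_output_val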
    def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        feature_type='word',
        input_length=None,
        num_filter_list=None,
        verbose=0,
        cv=3,
        batch_size=32,
        lr=1e-2,
        need_segmented=True,
        word2vec_model_file_path=None,
        num_labels=24,
        embedding_weight_trainable=False,
        shuffle_data=True,
        rand_weight=False,
        need_validation=True,
        include_train_data=True,
        vocabulary_including_test_set=True,
        n_estimators_list=None,
    ):

        print('=' * 80)
        print('feature_type: %s, need_segmented: %s, vocabulary_including_test_set: %s'
              % (feature_type, need_segmented, vocabulary_including_test_set))
        print('rand_weight: %s, embedding_weight_trainable: %s' %
              (rand_weight, embedding_weight_trainable))
        print('=' * 80)

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

        # 1. Get the cross-validation data
        if cv_data is None:
            assert train_data is not None, 'At least one of cv_data and train_data must be provided!'
            cv_data = get_k_fold_data(
                k=cv,
                train_data=train_data,
                test_data=test_data,
                include_train_data=include_train_data,
            )

        # 2. Feature-encode the data
        feature_encoder = WordEmbeddingCNN.get_feature_encoder(
            need_segmented=need_segmented,
            input_length=input_length,
            verbose=1,
            feature_type=feature_type,
            # keep the dictionary consistent
            update_dictionary=False,
            vocabulary_including_test_set=vocabulary_including_test_set,
        )

        cv_data = transform_cv_data(feature_encoder,
                                    cv_data,
                                    verbose=verbose,
                                    diff_train_val_feature_encoder=1)

        # cross-validation
        for num_filter in num_filter_list:
            for n_estimators in n_estimators_list:
                print('=' * 40)
                print('num_filter and n_estimators are %d, %d.' %
                      (num_filter, n_estimators))
                get_val_score(
                    RFAndRFAndWordEmbeddingCnnMerge,
                    num_filter=num_filter,
                    n_estimators=n_estimators,
                    cv_data=cv_data[:],
                    verbose=verbose,
                    num_labels=num_labels,
                    word2vec_model_file_path=word2vec_model_file_path,
                    embedding_weight_trainable=embedding_weight_trainable,
                    need_validation=need_validation,
                    rand_weight=rand_weight,
                    batch_size=batch_size,
                    lr=lr,
                    shuffle_data=shuffle_data,
                )
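
    # A hypothetical sweep for this variant: every (num_filter, n_estimators)
    # pair from the two lists is validated in turn. The data and w2v path
    # below mirror the docstring example of the first variant and are assumed.
    #
    #     cross_validation(
    #         train_data=(train_x, train_y),
    #         test_data=(test_x, test_y),
    #         input_length=8,
    #         num_filter_list=[5, 50],
    #         n_estimators_list=[100, 500],
    #         word2vec_model_file_path='vector1000000_50dim.gem',
    #         verbose=1,
    #     )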
    def __init__(self, feature_encoder, num_filter, num_labels, n_estimators,
                 word2vec_model_file_path, **kwargs):

        if kwargs.get('rand_weight', False):
            # CNN(rand) mode
            weight = None
        elif kwargs['dataset_flag'] == 0:
            if RFAndWordEmbeddingCnnMerge.train_data_weight is None:
                # training set
                RFAndWordEmbeddingCnnMerge.train_data_weight = feature_encoder.to_embedding_weight(
                    word2vec_model_file_path)
            weight = RFAndWordEmbeddingCnnMerge.train_data_weight
        else:
            # kwargs['dataset_flag']>0
            if RFAndWordEmbeddingCnnMerge.val_data_weight is None:
                RFAndWordEmbeddingCnnMerge.val_data_weight = feature_encoder.to_embedding_weight(
                    word2vec_model_file_path)
            weight = RFAndWordEmbeddingCnnMerge.val_data_weight
        # print(weight)
        self.static_w2v_cnn = WordEmbeddingCNN(
            rand_seed=1377,
            verbose=kwargs.get('verbose', 0),
            feature_encoder=feature_encoder,
            # optimizers='adadelta',
            optimizers='sgd',
            # only used in CNN(rand) mode
            word_embedding_dim=50,
            # initialize the embedding from the pretrained w2v model
            embedding_init_weight=weight,
            # by default, keep the embedding weights frozen during training
            embedding_weight_trainable=kwargs.get('embedding_weight_trainable',
                                                  False),
            num_labels=num_labels,
            l1_conv_filter_type=[
                [num_filter, 3, -1, 'valid', (-1, 1), 0.5, 'relu', 'none'],
                [num_filter, 4, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
                [num_filter, 5, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
            ],
            l2_conv_filter_type=[],
            full_connected_layer_units=[],
            embedding_dropout_rate=0.,
            nb_epoch=kwargs.get('nb_epoch', 25),
            batch_size=kwargs.get('batch_size', 32),
            earlyStoping_patience=30,
            lr=kwargs.get('lr', 1e-2),
            show_validate_accuracy=kwargs.get('verbose', 0) > 0,
            # output_regularizer=('l2', 0.5),
            output_constraints=('maxnorm', 3),
            # must be True so the intermediate output can be used as features
            save_middle_output=True,
        )

        self.bow_randomforest = BowRandomForest(
            rand_seed=1377,
            verbose=kwargs.get('verbose', 0),
            feature_encoder=feature_encoder,
            # optimizers='adadelta',
            n_estimators=n_estimators,
            min_samples_leaf=1,
        )
    test_cnn_feature_file_path = test_cnn_feature_file_path % seed

    print(model_file_path)
    print(result_file_path)
    print(train_cnn_feature_file_path)
    print(test_cnn_feature_file_path)


    rand_embedding_cnn = WordEmbeddingCNN(
        rand_seed=seed,
        verbose=verbose,
        optimizers=config['optimizers'],
        input_dim=feature_encoder.vocabulary_size + 1,
        word_embedding_dim=config['word_embedding_dim'],
        input_length=config['sentence_padding_length'],
        num_labels=len(label_to_index),
        conv_filter_type=config['conv_filter_type'],
        k=config['kmax_k'],
        embedding_dropout_rate=config['embedding_dropout_rate'],
        output_dropout_rate=config['output_dropout_rate'],
        nb_epoch=int(config['cnn_nb_epoch']),
        earlyStoping_patience=config['earlyStoping_patience'],
    )
    rand_embedding_cnn.print_model_descibe()

    if config['refresh_all_model'] or not os.path.exists(model_file_path):
        # train the model
        rand_embedding_cnn.fit((train_X_feature, train_y),
                               (test_X_feature, test_y))
        # save the model
        rand_embedding_cnn.save_model(model_file_path)
Example #6
label_to_index,index_to_label = data_util.get_label_index(version=config['label_version'])

# ****************************************************************
# ------------- region end : 1. load the training and test data -------------
# ****************************************************************

# ****************************************************************
# +++++++++++++ region start : 2. convert the data format and feature-encode +++++++++++++
# ****************************************************************
logging.debug('=' * 20)
logging.debug('2. Convert the data format and feature-encode')
from data_processing_util.cross_validation_util import transform_cv_data
from deep_learning.cnn.wordEmbedding_cnn.wordEmbedding_cnn_model import WordEmbeddingCNN

feature_encoder = WordEmbeddingCNN.get_feature_encoder(
    input_length=input_length,
    feature_type=feature_type,
)

train_X_feature = feature_encoder.fit_transform(train_data=train_data['SENTENCE'].as_matrix())
feature_encoder.print_model_descibe()
feature_encoder.print_sentence_length_detail()

# train_y = train_data['LABEL_INDEX'].as_matrix()

test_all_X_feature = feature_encoder.transform(test_data['SENTENCE'].as_matrix())

test_all_y = test_data['LABEL_INDEX'].as_matrix()
print(train_data['LABEL_INDEX'].as_matrix())

print(train_X_feature.shape)
print(test_all_X_feature.shape)
Example #7
word2vec_file_path = config['word2vec_model_file_path']
word2vec_file_path = word2vec_file_path % config['word_embedding_dim']
print(model_file_path)
print(word2vec_file_path)
for seed in [10, 100, 500, 1337, 2000, 300]:
    w2v_embedding_cnn = WordEmbeddingCNN(
        rand_seed=seed,
        verbose=verbose,
        input_dim=feature_encoder.vocabulary_size + 1,
        word_embedding_dim=config['word_embedding_dim'],
        embedding_init_weight=feature_encoder.to_embedding_weight(word2vec_file_path),
        input_length=config['padding_length'],
        num_labels=len(label_to_index),
        conv_filter_type=config['conv_filter_type'],
        k=config['kmax_k'],
        embedding_dropout_rate=config['embedding_dropout_rate'],
        output_dropout_rate=config['output_dropout_rate'],
        nb_epoch=int(config['cnn_nb_epoch']),
        earlyStoping_patience=config['earlyStoping_patience'],
        # pass the encoder itself, not a vocabulary size
        feature_encoder=feature_encoder,
        optimizers='sgd',
        lr=1e-1,
        batch_size=128,
    )
    w2v_embedding_cnn.print_model_descibe()

    if config['refresh_all_model'] or not os.path.exists(model_file_path):
        # train the model
        w2v_embedding_cnn.fit((train_w2v_features, train_y),
                              (test_w2v_features, test_y))
Example #8
    result_file_path = "".join([str(item) for item in config["result_file_path"]])
    result_file_path = result_file_path % seed

    print(model_file_path)
    print(result_file_path)
    print("rand seed: %d" % seed)

    rand_embedding_cnn = WordEmbeddingCNN(
        rand_seed=seed,
        verbose=1,
        optimizers=config["optimizers"],
        input_dim=feature_encoder.vocabulary_size,
        word_embedding_dim=config["word_embedding_dim"],
        input_length=sentence_padding_length,
        num_labels=len(label_to_index),
        l1_conv_filter_type=config["l1_conv_filter_type"],
        l2_conv_filter_type=config["l2_conv_filter_type"],
        full_connected_layer_units=config["full_connected_layer_units"],
        embedding_dropout_rate=config["embedding_dropout_rate"],
        nb_epoch=int(config["cnn_nb_epoch"]),
        earlyStoping_patience=config["earlyStoping_patience"],
        lr=config["lr"],
        batch_size=config["batch_size"],
        embedding_weight_trainable=True,
    )

    rand_embedding_cnn.print_model_descibe()

    if config["refresh_all_model"] or not os.path.exists(model_file_path):

        print ("+" * 80)
        # 训练模型
Example #9
    def get_model(feature_encoder, num_filter, num_labels,
                  word2vec_model_file_path, **kwargs):
        # print(WordEmbeddingCNNWithOneConv.weight)
        """获取 CNN(w2v)模型

        Parameters
        ----------
        feature_encoder : FeatureEncoder
            特征编码器
        num_filter : int
        num_labels : int
        word2vec_model_file_path : str
        kwargs : dict
            - dataset_flag
            - rand_weight : (default,False)设置为 True 时,为 CNN(rand) 模型
            - verbose
            - embedding_weight_trainable

        Returns
        -------

        """
        if kwargs.get('rand_weight', False):
            # CNN(rand) mode
            weight = None
        elif kwargs['dataset_flag'] == 0:
            if WordEmbeddingCNNWithOneConv.train_data_weight is None:
                # training set
                WordEmbeddingCNNWithOneConv.train_data_weight = feature_encoder.to_embedding_weight(
                    word2vec_model_file_path)
            weight = WordEmbeddingCNNWithOneConv.train_data_weight
        else:
            # kwargs['dataset_flag']>0
            if WordEmbeddingCNNWithOneConv.val_data_weight is None:
                WordEmbeddingCNNWithOneConv.val_data_weight = feature_encoder.to_embedding_weight(
                    word2vec_model_file_path)
            weight = WordEmbeddingCNNWithOneConv.val_data_weight
        # print(weight)
        static_w2v_cnn = WordEmbeddingCNN(
            rand_seed=1377,
            verbose=kwargs.get('verbose', 0),
            feature_encoder=feature_encoder,
            # optimizers='adadelta',
            optimizers='sgd',
            # only used in CNN(rand) mode
            word_embedding_dim=300,
            # initialize the embedding from the pretrained w2v model
            embedding_init_weight=weight,
            # by default, keep the embedding weights frozen during training
            embedding_weight_trainable=kwargs.get('embedding_weight_trainable',
                                                  False),
            num_labels=num_labels,
            l1_conv_filter_type=[
                [num_filter, 3, -1, 'valid', (-1, 1), 0.5, 'relu', 'none'],
                [num_filter, 4, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
                [num_filter, 5, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
            ],
            l2_conv_filter_type=[],
            full_connected_layer_units=[],
            embedding_dropout_rate=0.,
            nb_epoch=kwargs.get('nb_epoch', 25),
            batch_size=kwargs.get('batch_size', 32),
            earlyStoping_patience=30,
            lr=kwargs.get('lr', 1e-2),
            show_validate_accuracy=kwargs.get('verbose', 0) > 0,
            # output_regularizer=('l2', 0.5),
            output_constraints=('maxnorm', 3),
            save_middle_output=kwargs.get('get_cnn_middle_layer_output',
                                          False),
        )
        # static_w2v_cnn.print_model_descibe()
        # quit()
        return static_w2v_cnn
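
    # A hypothetical invocation of get_model above: dataset_flag=0 selects
    # the cached training-set embedding weight, while rand_weight=True would
    # switch to the CNN(rand) variant. The encoder and w2v path are assumed.
    #
    #     static_cnn = WordEmbeddingCNNWithOneConv.get_model(
    #         feature_encoder, num_filter=50, num_labels=24,
    #         word2vec_model_file_path='vector1000000_50dim.gem',
    #         dataset_flag=0, verbose=1, embedding_weight_trainable=False)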
Example #10
    word2vec_file_path = config['word2vec_file_path'] % config['word_embedding_dim']

    print(model_file_path)
    print(result_file_path)
    print(train_cnn_feature_file_path)
    print(test_cnn_feature_file_path)
    print(word2vec_file_path)

    rand_embedding_cnn = WordEmbeddingCNN(
        rand_seed=seed,
        verbose=verbose,
        input_dim=feature_encoder.vocabulary_size + 1,
        word_embedding_dim=config['word_embedding_dim'],
        embedding_init_weight=feature_encoder.to_embedding_weight(word2vec_file_path),
        input_length=config['sentence_padding_length'],
        num_labels=len(label_to_index),
        conv_filter_type=config['conv_filter_type'],
        k=config['kmax_k'],
        embedding_dropout_rate=config['embedding_dropout_rate'],
        output_dropout_rate=config['output_dropout_rate'],
        nb_epoch=int(config['cnn_nb_epoch']),
        earlyStoping_patience=config['earlyStoping_patience'],
    )
    rand_embedding_cnn.print_model_descibe()

    if config['refresh_all_model'] or not os.path.exists(model_file_path):
        # train the model
        rand_embedding_cnn.fit((feature_encoder.train_padding_index, train_y),
                               (list(map(feature_encoder.transform_sentence, test_X)), test_y))
        # save the model
        rand_embedding_cnn.save_model(model_file_path)
Example #12
class RFAndWordEmbeddingCnnMerge(CnnBaseClass):
    __version__ = '1.4'
    # When the full dataset is used as the dictionary, this variable caches the embedding weight to avoid reloading it, since the loaded weight is identical every time.
    train_data_weight = None
    # the validation data has its own weight, which does not include the test set
    val_data_weight = None

    def __init__(self, feature_encoder, num_filter, num_labels, n_estimators,
                 word2vec_model_file_path, **kwargs):
        self.static_w2v_cnn = None
        self.bow_randomforest = None
        self.feature_encoder = feature_encoder

        if not kwargs.get('init_model', True):
            # skip model initialization; typically used when restoring a model
            return

        if kwargs.get('rand_weight', False):
            # CNN(rand) mode
            weight = None
        elif kwargs['dataset_flag'] == 0:
            # training set
            if RFAndWordEmbeddingCnnMerge.train_data_weight is None:
                RFAndWordEmbeddingCnnMerge.train_data_weight = feature_encoder.to_embedding_weight(
                    word2vec_model_file_path)
            weight = RFAndWordEmbeddingCnnMerge.train_data_weight
        else:
            # kwargs['dataset_flag']>0
            if RFAndWordEmbeddingCnnMerge.val_data_weight is None:
                RFAndWordEmbeddingCnnMerge.val_data_weight = feature_encoder.to_embedding_weight(
                    word2vec_model_file_path)
            weight = RFAndWordEmbeddingCnnMerge.val_data_weight
        # print(weight)
        self.static_w2v_cnn = WordEmbeddingCNN(
            rand_seed=1377,
            verbose=kwargs.get('verbose', 0),
            feature_encoder=feature_encoder,
            # optimizers='adadelta',
            optimizers='sgd',
            # only used in CNN(rand) mode
            word_embedding_dim=50,
            # initialize the embedding from the pretrained w2v model
            embedding_init_weight=weight,
            # by default, keep the embedding weights frozen during training
            embedding_weight_trainable=kwargs.get('embedding_weight_trainable',
                                                  False),
            num_labels=num_labels,
            l1_conv_filter_type=[
                [num_filter, 3, -1, 'valid', (-1, 1), 0.5, 'relu', 'none'],
                [num_filter, 4, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
                [num_filter, 5, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
            ],
            l2_conv_filter_type=[],
            full_connected_layer_units=[],
            embedding_dropout_rate=0.,
            nb_epoch=kwargs.get('nb_epoch', 25),
            batch_size=kwargs.get('batch_size', 32),
            earlyStoping_patience=30,
            lr=kwargs.get('lr', 1e-2),
            show_validate_accuracy=kwargs.get('verbose', 0) > 0,
            # output_regularizer=('l2', 0.5),
            output_constraints=('maxnorm', 3),
            # must be True so the intermediate output can be used as features
            save_middle_output=True,
        )

        self.bow_randomforest = BowRandomForest(
            rand_seed=1377,
            verbose=kwargs.get('verbose', 0),
            feature_encoder=feature_encoder,
            # optimizers='adadelta',
            n_estimators=n_estimators,
            min_samples_leaf=1,
        )

    def fit(self, train_data=None, validation_data=None):
        train_X, train_y = train_data
        validation_X, validation_y = validation_data

        self.static_w2v_cnn.fit(train_data, validation_data)
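        # The CNN output at layer index 4 (the middle layer exposed via
        # save_middle_output=True) serves as the feature vector for the
        # random forest.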

        train_x_features = self.static_w2v_cnn.get_layer_output(train_X)[4]

        validation_x_features = self.static_w2v_cnn.get_layer_output(
            validation_X)[4]

        return self.bow_randomforest.fit((train_x_features, train_y),
                                         (validation_x_features, validation_y))

    def save_model(self, path):
        """
            保存模型,保存成pickle形式
        :param path: 模型保存的路径
        :type path: 模型保存的路径
        :return:
        """

        with open(path, 'wb') as model_file:
            pickle.dump(self.feature_encoder, model_file)
            pickle.dump(self.static_w2v_cnn, model_file)
            pickle.dump(self.bow_randomforest, model_file)

    def model_from_pickle(self, path):
        '''
        Load the model directly from a pickle file.

        :param path:
        :return: RandEmbeddingCNN object
        '''

        with open(path, 'rb') as model_file:
            self.feature_encoder = pickle.load(model_file)
            self.static_w2v_cnn = pickle.load(model_file)
            self.bow_randomforest = pickle.load(model_file)
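
    # A minimal save/restore round-trip sketch; the pickle path below is a
    # hypothetical placeholder. init_model=False skips rebuilding the
    # sub-models, the intended mode when restoring (see __init__ above).
    def roundtrip_example(self, path='rf_cnn_merge.pkl'):
        self.save_model(path)
        restored = RFAndWordEmbeddingCnnMerge(
            feature_encoder=None, num_filter=0, num_labels=0,
            n_estimators=0, word2vec_model_file_path=None,
            init_model=False,
        )
        restored.model_from_pickle(path)
        return restored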

    @staticmethod
    def get_feature_encoder(**kwargs):
        """
            获取该分类器的特征编码器

        :param kwargs:  可设置参数 [ input_length(*), full_mode(#,False), feature_type(#,word),verbose(#,0)],加*表示必须提供,加#表示可选,不写则默认。
        :return:
        """

        assert 'input_length' in kwargs, 'please provide the input_length attribute'

        from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder
        feature_encoder = FeatureEncoder(
            need_segmented=kwargs.get('need_segmented', True),
            sentence_padding_length=kwargs['input_length'],
            verbose=kwargs.get('verbose', 0),
            full_mode=kwargs.get('full_mode', False),
            remove_stopword=True,
            replace_number=True,
            lowercase=True,
            zhs2zht=True,
            remove_url=True,
            padding_mode='center',
            add_unkown_word=True,
            feature_type=kwargs.get('feature_type', 'word'),
            vocabulary_including_test_set=kwargs.get(
                'vocabulary_including_test_set', True),
            update_dictionary=kwargs.get('update_dictionary', True))

        return feature_encoder
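
    # Hypothetical usage of the factory above: input_length is the only
    # required key; everything else falls back to its default.
    #
    #     feature_encoder = RFAndWordEmbeddingCnnMerge.get_feature_encoder(
    #         input_length=8, feature_type='word', verbose=0)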

    def batch_predict_bestn(self, sentences, transform_input=False, bestn=1):
        """
                    批量预测句子的类别,对输入的句子进行预测

                :param sentences: 测试句子,
                :type sentences: array-like
                :param transform_input: 是否转换句子,如果为True,输入原始字符串句子即可,内部已实现转换成字典索引的形式。
                :type transform_input: bool
                :param bestn: 预测,并取出bestn个结果。
                :type bestn: int
                :return: y_pred_result, y_pred_score
                """
        if transform_input:
            sentences = self.static_w2v_cnn.transform(sentences)
        # sentences = np.asarray(sentences)
        # assert len(sentences.shape) == 2, 'shape must be 2-D!'

        train_x_features = self.static_w2v_cnn.get_layer_output(sentences)[4]
        # print(train_x_features)
        # print(train_x_features.shape)

        return self.bow_randomforest.batch_predict_bestn(train_x_features,
                                                         transform_input=False,
                                                         bestn=bestn)
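
# A hypothetical end-to-end sketch of the merged model above: the CNN's
# middle-layer output becomes the random-forest features, and raw strings
# are converted to dictionary indices internally. All names and values
# here are assumptions.
#
#     model = RFAndWordEmbeddingCnnMerge(
#         feature_encoder, num_filter=50, num_labels=24, n_estimators=200,
#         word2vec_model_file_path='vector1000000_50dim.gem',
#         dataset_flag=0, verbose=1)
#     model.fit((train_X, train_y), (validation_X, validation_y))
#     y_pred, y_score = model.batch_predict_bestn(
#         ['我要买手机'], transform_input=True, bestn=1)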
Example #15
cv_data = data_util.get_k_fold_data(k=3,
                                    data=train_data,
                                    rand_seed=0,
                                    )

WordEmbeddingCNN.cross_validation(
    cv_data,
    (test_data[u'SENTENCE'].as_matrix(), test_y),
    'result/static_W2V_%s_cv_detail.txt' % feature_type,
    rand_seed=rand_seed,
    nb_epoch=nb_epoch,
    verbose=verbose,
    feature_type=feature_type,
    full_mode=False,
    layer1=layer1,
    l1_conv_filter_type=l1_conv_filter_type,
    layer2=layer2,
    l2_conv_filter_type=l2_conv_filter_type,
    k=k,
    hidden1=hidden1,
    hidden2=hidden2,
    word_embedding_dim=word_embedding_dim,
    sentence_padding_length=sentence_padding_length,
    word2vec_model_file_path=data_util.transform_word2vec_model_name('%dd_weibo_100w' % word_embedding_dim),
    embedding_weight_trainable=True,
)


end_time = timeit.default_timer()
print('end! Running time: %ds!' % (end_time - start_time))
logging.debug('=' * 20)