コード例 #1
0
    def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        input_length=None,
        bow_num_filter_list=None,
        w2v_num_filter_list=None,
        bow_region_size_list=None,
        verbose=0,
        word2vec_model_file_path=None,
        cv=3,
        num_labels=24,
        include_train_data=True,
    ):
        """Cross-validate every (bow_num_filter, bow_region_size, w2v_num_filter) combination.

        Parameters
        ----------
        train_data, test_data
            Handed to ``get_k_fold_data`` to build the folds; only used when
            ``cv_data`` is None.
        cv_data
            Pre-built folds. When None, ``train_data`` must be provided.
        input_length
            Forwarded to ``BowWordEmbeddingMergeCNN.get_feature_encoder``.
        bow_num_filter_list, bow_region_size_list, w2v_num_filter_list
            Candidate values to try. All three must be iterables of numbers:
            the None defaults cannot be iterated and each value is printed
            with ``%d``.
        verbose : int
            Forwarded to ``get_val_score`` only; feature encoding always runs
            with ``verbose=0``.
        word2vec_model_file_path
            Forwarded to ``get_val_score``.
        cv : int
            Passed as ``k`` to ``get_k_fold_data``. Defaults to 3, the value
            that was previously hard-coded.
        num_labels : int
            Forwarded to ``get_val_score``. Defaults to 24, the value that was
            previously hard-coded.
        include_train_data : bool
            Forwarded to ``get_k_fold_data``. Defaults to True, the value that
            was previously hard-coded.

        Returns
        -------
        None
            Progress is reported through ``print``; the return value of
            ``get_val_score`` is discarded.
        """

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

        # 1. Obtain the cross-validation folds (built here unless supplied).
        if cv_data is None:
            assert train_data is not None, 'cv_data和train_data必须至少提供一个!'
            cv_data = get_k_fold_data(
                k=cv,
                train_data=train_data,
                test_data=test_data,
                include_train_data=include_train_data,
            )

        # 2. Encode the folds with the model's feature encoder.
        feature_encoder = BowWordEmbeddingMergeCNN.get_feature_encoder(
            input_length=input_length,
            verbose=0,
            feature_type='word',
        )
        cv_data = transform_cv_data(feature_encoder, cv_data, verbose=0)

        # 3. Score every combination of the three candidate lists.
        for bow_num_filter in bow_num_filter_list:
            for bow_region_size in bow_region_size_list:
                for w2v_num_filter in w2v_num_filter_list:
                    print('=' * 40)
                    print(
                        'bow_num_filter,bow_region_size and w2v_num_filter is %d,%d,%d.'
                        % (bow_num_filter, bow_region_size, w2v_num_filter))
                    get_val_score(
                        BowWordEmbeddingMergeCNNWithOneConv,
                        cv_data=cv_data,
                        verbose=verbose,
                        bow_num_filter=bow_num_filter,
                        bow_region_size=bow_region_size,
                        w2v_num_filter=w2v_num_filter,
                        num_labels=num_labels,
                        word2vec_model_file_path=word2vec_model_file_path,
                    )
コード例 #2
0
    def cross_validation(
            train_data=None,
            test_data=None,
            cv_data=None,
            input_length=None,
            bow_num_filter_list=None,
            w2v_num_filter_list=None,
            bow_region_size_list=None,
            verbose=0,
            word2vec_model_file_path=None,
            cv=3,
            num_labels=24,
            include_train_data=True,
    ):
        """Try all BOW/word2vec filter settings and score each one on the CV folds.

        Parameters
        ----------
        train_data, test_data
            Only read when ``cv_data`` is None; both are passed to
            ``get_k_fold_data``.
        cv_data
            Ready-made folds. If omitted, ``train_data`` is required.
        input_length
            Passed to ``BowWordEmbeddingMergeCNN.get_feature_encoder``.
        bow_num_filter_list, bow_region_size_list, w2v_num_filter_list
            Iterables of numeric candidates (formatted with ``%d``). They have
            to be supplied, since the None defaults are not iterable.
        verbose : int
            Passed to ``get_val_score``; the encoder and ``transform_cv_data``
            are always called with ``verbose=0``.
        word2vec_model_file_path
            Passed to ``get_val_score``.
        cv : int
            Number handed to ``get_k_fold_data`` as ``k`` (default 3, formerly
            a literal in the body).
        num_labels : int
            Passed to ``get_val_score`` (default 24, formerly a literal in the
            body).
        include_train_data : bool
            Passed to ``get_k_fold_data`` (default True, formerly a literal in
            the body).

        Returns
        -------
        None
        """

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

        # 1. Build the folds when the caller did not pass them in.
        if cv_data is None:
            assert train_data is not None, 'cv_data和train_data必须至少提供一个!'
            cv_data = get_k_fold_data(
                k=cv,
                train_data=train_data,
                test_data=test_data,
                include_train_data=include_train_data,
            )

        # 2. Convert the raw folds into encoded features.
        feature_encoder = BowWordEmbeddingMergeCNN.get_feature_encoder(
            input_length=input_length,
            verbose=0,
            feature_type='word',
        )
        cv_data = transform_cv_data(feature_encoder, cv_data, verbose=0)

        # 3. Cross validation over the full parameter grid.
        for bow_num_filter in bow_num_filter_list:
            for bow_region_size in bow_region_size_list:
                for w2v_num_filter in w2v_num_filter_list:
                    print('=' * 40)
                    print('bow_num_filter,bow_region_size and w2v_num_filter is %d,%d,%d.'
                          % (bow_num_filter, bow_region_size, w2v_num_filter))
                    get_val_score(
                        BowWordEmbeddingMergeCNNWithOneConv,
                        cv_data=cv_data,
                        verbose=verbose,
                        bow_num_filter=bow_num_filter,
                        bow_region_size=bow_region_size,
                        w2v_num_filter=w2v_num_filter,
                        num_labels=num_labels,
                        word2vec_model_file_path=word2vec_model_file_path,
                    )
コード例 #3
0
    def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        input_length=None,
        conv1_num_filter_list=None,
        conv2_num_filter_list=None,
        verbose=0,
        word2vec_model_file_path=None,
        cv=3,
        num_labels=24,
        include_train_data=True,
    ):
        """Cross-validate every (conv1_num_filter, conv2_num_filter) pair with ``DcnnAcl``.

        Parameters
        ----------
        train_data, test_data
            Passed to ``get_k_fold_data``; only used when ``cv_data`` is None.
        cv_data
            Pre-built folds. When None, ``train_data`` must be provided.
        input_length
            Forwarded to ``DCNN.get_feature_encoder``.
        conv1_num_filter_list, conv2_num_filter_list
            Iterables of numeric candidates (printed with ``%d``). Both must be
            supplied; the None defaults cannot be iterated.
        verbose : int
            Forwarded to ``get_val_score`` only; encoding runs with
            ``verbose=0``.
        word2vec_model_file_path
            Forwarded to ``get_val_score``.
        cv : int
            Passed as ``k`` to ``get_k_fold_data`` (default 3, previously
            hard-coded).
        num_labels : int
            Forwarded to ``get_val_score`` (default 24, previously hard-coded).
        include_train_data : bool
            Forwarded to ``get_k_fold_data`` (default True, previously
            hard-coded).

        Returns
        -------
        None
        """

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

        # 1. Obtain the cross-validation folds.
        if cv_data is None:
            assert train_data is not None, 'cv_data和train_data必须至少提供一个!'
            cv_data = get_k_fold_data(
                k=cv,
                train_data=train_data,
                test_data=test_data,
                include_train_data=include_train_data,
            )

        # 2. Encode the folds into model features.
        feature_encoder = DCNN.get_feature_encoder(
            input_length=input_length,
            verbose=0,
            full_mode=False,
            feature_type='word',
        )
        cv_data = transform_cv_data(feature_encoder, cv_data, verbose=0)

        # 3. Cross validation over both filter-count lists.
        for conv1_num_filter in conv1_num_filter_list:
            for conv2_num_filter in conv2_num_filter_list:
                print('=' * 40)
                print('num_filter of conv1 and conv2 is %d,%d .' %
                      (conv1_num_filter, conv2_num_filter))
                get_val_score(
                    DcnnAcl,
                    cv_data=cv_data,
                    verbose=verbose,
                    conv1_num_filter=conv1_num_filter,
                    conv2_num_filter=conv2_num_filter,
                    num_labels=num_labels,
                    word2vec_model_file_path=word2vec_model_file_path,
                )
コード例 #4
0
    def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        input_length=None,
        feature_type='word',
        num_filter_list=None,
        region_size_list=None,
        word2vec_to_solve_oov=False,
        word2vec_model_file_path=None,
        verbose=0,
        cv=3,
        num_labels=24,
        include_train_data=True,
    ):
        """Cross-validate every (num_filter, region_size) pair with ``OnehotCNNWithOneConv``.

        Parameters
        ----------
        train_data, test_data
            Passed to ``get_k_fold_data``; only used when ``cv_data`` is None.
        cv_data
            Pre-built folds. When None, ``train_data`` must be provided.
        input_length, feature_type, word2vec_to_solve_oov, word2vec_model_file_path
            Forwarded unchanged to ``OnehotBowCNN.get_feature_encoder``.
        num_filter_list, region_size_list
            Iterables of numeric candidates (printed with ``%d``). Both must be
            supplied; the None defaults cannot be iterated.
        verbose : int
            Forwarded to the feature encoder and to ``get_val_score``;
            ``transform_cv_data`` is always called with ``verbose=0``.
        cv : int
            Passed as ``k`` to ``get_k_fold_data`` (default 3, previously
            hard-coded).
        num_labels : int
            Forwarded to ``get_val_score`` (default 24, previously hard-coded).
        include_train_data : bool
            Forwarded to ``get_k_fold_data`` (default True, previously
            hard-coded).

        Returns
        -------
        None
        """

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

        # 1. Obtain the cross-validation folds.
        if cv_data is None:
            assert train_data is not None, 'cv_data和train_data必须至少提供一个!'
            cv_data = get_k_fold_data(
                k=cv,
                train_data=train_data,
                test_data=test_data,
                include_train_data=include_train_data,
            )

        # 2. Encode the folds into model features.
        feature_encoder = OnehotBowCNN.get_feature_encoder(
            input_length=input_length,
            verbose=verbose,
            feature_type=feature_type,
            word2vec_to_solve_oov=word2vec_to_solve_oov,
            word2vec_model_file_path=word2vec_model_file_path,
        )
        cv_data = transform_cv_data(feature_encoder, cv_data, verbose=0)

        # 3. Cross validation over both candidate lists.
        for num_filter in num_filter_list:
            for region_size in region_size_list:
                print('=' * 40)
                print('num_filter and region_size is %d,%d.' %
                      (num_filter, region_size))
                get_val_score(
                    OnehotCNNWithOneConv,
                    cv_data=cv_data,
                    verbose=verbose,
                    region_size=region_size,
                    num_filter=num_filter,
                    num_labels=num_labels,
                )
コード例 #5
0
    def cross_validation(
            train_data=None,
            test_data=None,
            cv_data=None,
            input_length=None,
            feature_type='word',
            num_filter_list=None,
            region_size_list=None,
            word2vec_to_solve_oov=False,
            word2vec_model_file_path=None,
            verbose=0,
            cv=3,
            num_labels=24,
            include_train_data=True,
    ):
        """Score ``OnehotCNNWithOneConv`` on the CV folds for each filter-count / region-size pair.

        Parameters
        ----------
        train_data, test_data
            Only read when ``cv_data`` is None; both go to ``get_k_fold_data``.
        cv_data
            Ready-made folds. If omitted, ``train_data`` is required.
        input_length, feature_type, word2vec_to_solve_oov, word2vec_model_file_path
            Passed through to ``OnehotBowCNN.get_feature_encoder``.
        num_filter_list, region_size_list
            Iterables of numeric candidates (formatted with ``%d``). They have
            to be supplied, since the None defaults are not iterable.
        verbose : int
            Passed to the feature encoder and to ``get_val_score``;
            ``transform_cv_data`` always receives ``verbose=0``.
        cv : int
            Number handed to ``get_k_fold_data`` as ``k`` (default 3, formerly
            a literal in the body).
        num_labels : int
            Passed to ``get_val_score`` (default 24, formerly a literal in the
            body).
        include_train_data : bool
            Passed to ``get_k_fold_data`` (default True, formerly a literal in
            the body).

        Returns
        -------
        None
        """

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

        # 1. Build the folds when the caller did not pass them in.
        if cv_data is None:
            assert train_data is not None, 'cv_data和train_data必须至少提供一个!'
            cv_data = get_k_fold_data(
                k=cv,
                train_data=train_data,
                test_data=test_data,
                include_train_data=include_train_data,
            )

        # 2. Convert the raw folds into encoded features.
        feature_encoder = OnehotBowCNN.get_feature_encoder(
            input_length=input_length,
            verbose=verbose,
            feature_type=feature_type,
            word2vec_to_solve_oov=word2vec_to_solve_oov,
            word2vec_model_file_path=word2vec_model_file_path,
        )
        cv_data = transform_cv_data(feature_encoder, cv_data, verbose=0)

        # 3. Cross validation over the parameter grid.
        for num_filter in num_filter_list:
            for region_size in region_size_list:
                print('=' * 40)
                print('num_filter and region_size is %d,%d.' % (num_filter, region_size))
                get_val_score(
                    OnehotCNNWithOneConv,
                    cv_data=cv_data,
                    verbose=verbose,
                    region_size=region_size,
                    num_filter=num_filter,
                    num_labels=num_labels,
                )
コード例 #6
0
    def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        shuffle_data=True,
        n_estimators_list=None,
        feature_type='word',
        word2vec_to_solve_oov=False,
        word2vec_model_file_path=None,
        verbose=0,
        cv=3,
        need_transform_input=True,
        need_segmented=True,
        need_validation=True,
        include_train_data=True,
    ):
        """Cross-validate the ``n_estimators`` parameter of ``BowRandomForest``.

        Parameters
        ----------
        word2vec_model_file_path : str
            Path to the word2vec model.
        train_data : (array-like, array-like)
            Training data ``(train_X, train_y)``.
        test_data : (array-like, array-like)
            Test data ``(test_X, test_y)``.
        cv_data : array-like
            The k folds of validation data. When None, the folds are built
            from ``train_data`` (which is then required).
        word2vec_to_solve_oov : bool
            Whether to use word2vec for replacement (forwarded to the feature
            encoder).
        n_estimators_list : array-like
            Parameter under validation: number of trees in the random forest.
            Must be supplied; the None default cannot be iterated.
        feature_type : str
            Feature type, only in ['word', 'seg', 'word_seg'].
        shuffle_data : bool
            Whether to shuffle the data.
        verbose : int
            The larger the value, the more detailed the output.
        cv : int
            Number of folds.
        need_transform_input : bool
            Whether the data still has to be transformed by the feature
            encoder. When False, ``cv_data`` is used as given, except that
            items are padded with None up to 6 entries if the first item is
            shorter than that.
        need_segmented : bool
            Whether word segmentation is needed.
        include_train_data : bool
            Whether the training data is validated as well.
        need_validation : bool
            Whether to validate.

        Returns
        -------
        None
        """

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score
        # 1. Obtain the cross-validation folds.
        if cv_data is None:
            assert train_data is not None, 'cv_data和train_data必须至少提供一个!'
            cv_data = get_k_fold_data(
                k=cv,
                train_data=train_data,
                test_data=test_data,
                include_train_data=include_train_data,
            )
        # region 2. Encode the folds into model features
        if need_transform_input:
            feature_encoder = BowRandomForest.get_feature_encoder(
                verbose=verbose,
                need_segmented=need_segmented,
                feature_type=feature_type,
                word2vec_to_solve_oov=word2vec_to_solve_oov,
                word2vec_model_file_path=word2vec_model_file_path,
            )
            # diff_train_val_feature_encoder=1: the feature encoder differs
            # each time.
            cv_data = transform_cv_data(feature_encoder,
                                        cv_data,
                                        verbose=verbose,
                                        diff_train_val_feature_encoder=1)
        elif len(cv_data) > 0 and len(cv_data[0]) < 6:
            # Every cv_data item needs 6 entries; pad short items with None
            # up to that width. (Appending a single None only reached 6 for
            # items that already had 5 entries, and failed on tuples.)
            cv_data = [
                list(item) + [None] * (6 - len(item)) for item in cv_data
            ]
        # endregion

        # region 3. Cross validation
        for n_estimators in n_estimators_list:
            print('=' * 40)
            print('n_estimators is %d.' % n_estimators)
            get_val_score(
                BowRandomForest,
                # Hand each run its own shallow copy of the fold list.
                cv_data=cv_data[:],
                verbose=verbose,
                shuffle_data=shuffle_data,
                need_validation=need_validation,
                n_estimators=n_estimators,
            )
コード例 #7
0
    def cross_validation(
            train_data=None,
            test_data=None,
            cv_data=None,
            feature_type='word',
            input_length=None,
            num_filter_list=None,
            verbose=0,
            cv=3,
            batch_size=32,
            lr=1e-2,
            need_segmented=True,
            word2vec_model_file_path=None,
            num_labels=24,
            embedding_weight_trainable=False,
            # Retrieving the middle-layer output
            get_cnn_middle_layer_output=False,
            middle_layer_output_file=None,
            rand_weight=False,
            need_validation=True,
            include_train_data=True,
            vocabulary_including_test_set=True,
    ):
        """Cross-validate the ``num_filter`` parameter of ``WordEmbeddingCNNWithOneConv``.

        Parameters
        ----------
        train_data : array-like
            Training data ``(train_X, train_y)``.
        test_data : array-like
            Test data.
        cv_data : array-like
            The k folds of validation data. When None, the folds are built
            from ``train_data`` (which is then required).
        input_length : int
            Input length.
        num_filter_list : array-like
            Parameter under validation: number of filters. Must be supplied;
            the None default cannot be iterated.
        middle_layer_output_file : str
            File that receives the middle-layer output. Required when
            ``get_cnn_middle_layer_output`` is True.
        get_cnn_middle_layer_output : bool
            Whether to fetch the middle-layer output (default False).
        num_labels : int
            Labels (forwarded to ``get_val_score``).
        batch_size : int
            Batch size.
        lr : float
            Forwarded to ``get_val_score``.
        vocabulary_including_test_set : bool, default True
            Whether the dictionary includes the test set.
        include_train_data : bool
            Whether the training data is validated as well.
        need_validation : bool
            Whether to validate.
        embedding_weight_trainable : bool
            Switches between CNN(static-w2v) and CNN(non-static-w2v).
        rand_weight : bool
            Switches between CNN(rand) and CNN(static/non-static-w2v).
        feature_type : str
            Feature type.
        verbose : int
            The larger the value, the more detailed the output.
        cv : int
            Number of folds.
        need_segmented : bool
            Whether word segmentation is needed.
        word2vec_model_file_path
            Forwarded to ``get_val_score``; also printed in the banner unless
            ``rand_weight`` is set.

        Raises
        ------
        ValueError
            If ``get_cnn_middle_layer_output`` is True but
            ``middle_layer_output_file`` is None.

        Notes
        ----------
        - For efficiency ``update_dictionary=False`` is set by default, so the
          feature encoder's dictionary stays consistent and is not rebuilt
          repeatedly.
        - ``diff_train_val_feature_encoder=1`` is set as well, so the training
          and validation sets use different feature encoders, because the
          dictionary sizes differ.

        Examples
        ----------
        >>> train_x = ['你好', '测试句子', '我要买手机', '今天天气不错', '无聊']
        >>> train_y = [1, 2, 3, 2, 3]
        >>> test_x = ['你好', '不错哟']
        >>> test_y = [1, 2]
        >>> WordEmbeddingCNNWithOneConv.cross_validation(
        ...     train_data=(train_x, train_y),
        ...     test_data=(test_x, test_y),
        ...     input_length=8,
        ...     num_filter_list=[5, 50],
        ...     verbose=1,
        ...     word2vec_model_file_path='path/to/vector1000000_50dim.gem',
        ... )

        """
        # Fail before any training happens instead of when the results are
        # about to be written.
        if get_cnn_middle_layer_output and middle_layer_output_file is None:
            raise ValueError(
                'middle_layer_output_file must be set when '
                'get_cnn_middle_layer_output is True')

        print('=' * 80)
        print('feature_type: %s, need_segmented: %s, vocabulary_including_test_set: %s' % (
            feature_type, need_segmented, vocabulary_including_test_set))
        # %s (not %d) for input_length: its default is None, which %d rejects.
        print('input_length: %s, num_labels: %d' % (input_length, num_labels))
        print('lr: %f, batch_size: %d, rand_weight: %s, embedding_weight_trainable: %s' % (
            lr, batch_size, rand_weight, embedding_weight_trainable))
        if not rand_weight:
            print('W2V model file_path: %s' % word2vec_model_file_path)
        print('=' * 80)

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

        # 1. Obtain the cross-validation folds.
        if cv_data is None:
            assert train_data is not None, 'cv_data和train_data必须至少提供一个!'
            cv_data = get_k_fold_data(
                k=cv,
                train_data=train_data,
                test_data=test_data,
                include_train_data=include_train_data,
            )

        # 2. Encode the folds into model features.
        feature_encoder = WordEmbeddingCNN.get_feature_encoder(
            need_segmented=need_segmented,
            input_length=input_length,
            verbose=1,
            feature_type=feature_type,
            padding_mode='center',
            # Keep the dictionary consistent.
            update_dictionary=False,
            vocabulary_including_test_set=vocabulary_including_test_set,
        )

        cv_data = transform_cv_data(feature_encoder, cv_data, verbose=verbose, diff_train_val_feature_encoder=1)

        # 3. Cross validation
        for num_filter in num_filter_list:
            print('=' * 40)
            print('num_filter is %d.' % num_filter)
            _, _, middle_output_dev, middle_output_val = get_val_score(
                WordEmbeddingCNNWithOneConv,
                cv_data=cv_data[:],
                verbose=verbose,
                num_filter=num_filter,
                num_labels=num_labels,
                word2vec_model_file_path=word2vec_model_file_path,
                embedding_weight_trainable=embedding_weight_trainable,
                get_cnn_middle_layer_output=get_cnn_middle_layer_output,
                need_validation=need_validation,
                rand_weight=rand_weight,
                batch_size=batch_size,
                lr=lr,
            )

            if get_cnn_middle_layer_output:
                # Save the intermediate results. pickle writes bytes, so the
                # file has to be opened in binary mode.
                # NOTE(review): the file is reopened in write mode on every
                # iteration, so only the last num_filter's outputs remain on
                # disk -- confirm this is intended.
                with open(middle_layer_output_file, 'wb') as fout:
                    pickle.dump(cv_data, fout)
                    pickle.dump(middle_output_dev, fout)
                    pickle.dump(middle_output_val, fout)
コード例 #8
0
    def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        feature_type='word',
        input_length=None,
        num_filter_list=None,
        verbose=0,
        cv=3,
        batch_size=32,
        lr=1e-2,
        need_segmented=True,
        word2vec_model_file_path=None,
        num_labels=24,
        embedding_weight_trainable=False,
        shuffle_data=True,
        rand_weight=False,
        need_validation=True,
        include_train_data=True,
        vocabulary_including_test_set=True,
        n_estimators_list=None,
    ):
        """Cross-validate every (num_filter, n_estimators) pair with ``RFAndRFAndWordEmbeddingCnnMerge``.

        Folds come from ``cv_data`` or, when that is None, from
        ``get_k_fold_data(k=cv, ...)`` applied to ``train_data`` /
        ``test_data``. They are encoded with
        ``WordEmbeddingCNN.get_feature_encoder`` and each parameter pair is
        handed to ``get_val_score``. ``num_filter_list`` and
        ``n_estimators_list`` must be iterables of numbers (printed with
        ``%d``). Returns None.
        """
        # Banner with the run configuration.
        wide_rule = '=' * 80
        print(wide_rule)
        print('feature_type:%s,need_segmented:%s,vocabulary_including_test_set:%s' % (
            feature_type,
            need_segmented,
            vocabulary_including_test_set,
        ))
        print('rand_weight:%s,embedding_weight_trainable:%s' % (
            rand_weight,
            embedding_weight_trainable,
        ))
        print(wide_rule)

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

        # Step 1: obtain the folds, building them if none were passed in.
        if cv_data is None:
            assert train_data is not None, 'cv_data和train_data必须至少提供一个!'
            cv_data = get_k_fold_data(k=cv,
                                      train_data=train_data,
                                      test_data=test_data,
                                      include_train_data=include_train_data)

        # Step 2: encode the folds; update_dictionary=False keeps the
        # dictionary consistent.
        encoder = WordEmbeddingCNN.get_feature_encoder(
            need_segmented=need_segmented,
            input_length=input_length,
            verbose=1,
            feature_type=feature_type,
            update_dictionary=False,
            vocabulary_including_test_set=vocabulary_including_test_set,
        )
        cv_data = transform_cv_data(
            encoder, cv_data, verbose=verbose, diff_train_val_feature_encoder=1)

        # Options that are identical for every parameter pair.
        fixed_options = dict(
            verbose=verbose,
            num_labels=num_labels,
            word2vec_model_file_path=word2vec_model_file_path,
            embedding_weight_trainable=embedding_weight_trainable,
            need_validation=need_validation,
            rand_weight=rand_weight,
            batch_size=batch_size,
            lr=lr,
            shuffle_data=shuffle_data,
        )

        # Step 3: cross validation over both candidate lists.
        narrow_rule = '=' * 40
        for filters in num_filter_list:
            for trees in n_estimators_list:
                print(narrow_rule)
                print('num_filter and n_estimators is %d,%d.' % (filters, trees))
                # A fresh shallow copy of the folds is passed on each call.
                get_val_score(
                    RFAndRFAndWordEmbeddingCnnMerge,
                    num_filter=filters,
                    n_estimators=trees,
                    cv_data=cv_data[:],
                    **fixed_options
                )
コード例 #9
0
    def cross_validation(cv_data, test_data, result_file_path, **kwargs):
        """
            Cross-validate the layer1/layer2/hidden1/hidden2 settings and write
            the detailed results to ``result_file_path``.

        :param cv_data: k folds of training data
        :type cv_data: array-like
        :param test_data: test data
        :type test_data: array-like
        :param result_file_path: path of the detailed-result file; it is
            opened in write mode and passed to ``get_val_score``
        :param kwargs: required keys: ``nb_epoch``, ``verbose``,
            ``num_labels``, ``remove_stopword``, ``word2vec_to_solve_oov``,
            ``rand_seed``, ``l1_conv_filter_type`` (indexed as ``[i][0]`` and
            ``[i][1]`` for i in 0..2), ``l2_conv_filter_type``, ``k``, ``lr``
            and ``use_layer``. Optional keys ``layer1``, ``layer2``,
            ``hidden1``, ``hidden2`` hold the candidate lists; a missing or
            empty list is replaced by ``[-1]``. The whole dict is also
            forwarded to ``transform_cv_data``.
        :raises KeyError: if a required key is missing from ``kwargs``
        :return: None
        """

        nb_epoch = kwargs['nb_epoch']
        verbose = kwargs['verbose']
        num_labels = kwargs['num_labels']
        word_input_length, seg_input_length = 10, 7
        remove_stopword = kwargs['remove_stopword']
        word2vec_to_solve_oov = kwargs['word2vec_to_solve_oov']
        rand_seed = kwargs['rand_seed']
        l1_conv_filter_type = kwargs['l1_conv_filter_type']
        l2_conv_filter_type = kwargs['l2_conv_filter_type']
        k = kwargs['k']
        lr = kwargs['lr']
        use_layer = kwargs['use_layer']

        def _candidates(name):
            # A missing or empty candidate list falls back to [-1].
            values = kwargs.get(name, [])
            return values if values != [] else [-1]

        layer1 = _candidates('layer1')
        layer2 = _candidates('layer2')
        hidden1 = _candidates('hidden1')
        hidden2 = _candidates('hidden2')

        # Detailed results are saved to result_file_path. The with-block
        # guarantees the file is closed even if a run below raises.
        with open(result_file_path, 'w') as fout:
            # Values are wrapped in 1-tuples so that a tuple-valued setting is
            # printed as a whole instead of being unpacked by the % operator.
            print('=' * 150)
            print('调节的参数....')
            print('use_layer:%s' % (use_layer,))
            print('layer1:%s' % str(layer1))
            print('layer2:%s' % str(layer2))
            print('hidden1:%s' % str(hidden1))
            print('hidden2:%s' % str(hidden2))
            print('-' * 150)
            print('word_input_length:%d\nseg_input_length:%d' % (word_input_length, seg_input_length))
            print('使用word2vec:%s\nremove_stopword:%s\nnb_epoch:%d\nrand_seed:%d' % (
                word2vec_to_solve_oov, remove_stopword, nb_epoch, rand_seed))
            print('l1_conv_filter_type:%s' % (l1_conv_filter_type,))
            print('l2_conv_filter_type:%s' % (l2_conv_filter_type,))
            print('k:%s' % (k,))
            print('=' * 150)

            fout.write('=' * 150 + '\n')
            fout.write('cv结果:\n')
            fout.write('lr:%f\nnb_epoch:%d\nrand_seed:%d\n' % (lr, nb_epoch, rand_seed))
            fout.write('l1_conv_filter_type:%s\n' % (l1_conv_filter_type,))
            fout.write('l2_conv_filter_type:%s\n' % (l2_conv_filter_type,))
            fout.write('k:%s\n' % (k,))
            fout.write('=' * 150 + '\n')

            from data_processing_util.cross_validation_util import transform_cv_data, get_val_score
            word_feature_encoder, seg_feature_encoder = MultiChannelOnehotBowCNN.get_feature_encoder(
                word_input_length=word_input_length,
                seg_input_length=seg_input_length,
            )

            all_cv_word_data = transform_cv_data(word_feature_encoder, cv_data, test_data, **kwargs)
            all_cv_seg_data = transform_cv_data(seg_feature_encoder, cv_data, test_data, **kwargs)
            # Merge the two channels fold by fold. As before, the labels are
            # taken from the seg-channel tuple and the encoders stored per
            # fold are the ones returned by transform_cv_data.
            cv_data = [
                ([dev_word_X, dev_seg_X], dev_y, [val_word_X, val_seg_X], val_y,
                 (fold_word_encoder, fold_seg_encoder))
                for (dev_word_X, _, val_word_X, _, fold_word_encoder),
                    (dev_seg_X, dev_y, val_seg_X, val_y, fold_seg_encoder)
                in zip(all_cv_word_data, all_cv_seg_data)
            ]

            # Cross validation over every combination of the four lists.
            for l1, l2, h1, h2 in product(layer1, layer2, hidden1, hidden2):

                fout.write('=' * 150 + '\n')
                fout.write('layer1:%d,layer2:%d,hidden1:%d,hidden2:%d\n' % (l1, l2, h1, h2))
                print('layer1:%d,layer2:%d,hidden1:%d,hidden2:%d' % (l1, l2, h1, h2))

                l1_conv_filter = []
                if 'conv1' in use_layer:
                    # One filter spec per entry of l1_conv_filter_type[0..2].
                    l1_conv_filter.extend([
                        [l1, l1_conv_filter_type[0][0], -1, l1_conv_filter_type[0][1], (0, 1), 0., 'relu', 'none'],
                        [l1, l1_conv_filter_type[1][0], -1, l1_conv_filter_type[1][1], (0, 1), 0., 'relu', 'none'],
                        [l1, l1_conv_filter_type[2][0], -1, l1_conv_filter_type[2][1], (0, 1), 0., 'relu', 'none'],
                    ])

                full_connected_layer_units = []
                if 'hidden1' in use_layer:
                    full_connected_layer_units.append([h1, 0., 'relu', 'none'])

                parm = {'l1_conv_filter_type': l1_conv_filter,
                        'full_connected_layer_units': full_connected_layer_units,
                        'num_labels': num_labels,
                        'verbose': verbose,
                        'nb_epoch': nb_epoch,
                        'lr': lr
                        }
                get_val_score(MultiChannelOnehotBowCNN, cv_data, fout, **parm)
コード例 #10
0
ファイル: bow_rf_model.py プロジェクト: JDwangmo/nlp_util
    def cross_validation(
            train_data=None,
            test_data=None,
            cv_data=None,
            shuffle_data=True,
            n_estimators_list=None,
            feature_type='word',
            word2vec_to_solve_oov=False,
            word2vec_model_file_path=None,
            verbose=0,
            cv=3,
            need_transform_input=True,
            need_segmented=True,
            need_validation=True,
            include_train_data=True,
    ):
        """Run cross validation over the candidate ``n_estimators`` values.

        Parameters
        ----------
        word2vec_model_file_path : str
            Path to the word2vec model.
        train_data : (array-like, array-like)
            Training data ``(train_X, train_y)``.
        test_data : (array-like, array-like)
            Test data ``(test_X, test_y)``.
        cv_data : array-like
            The k folds of validation data. If None, they are produced by
            ``get_k_fold_data`` from ``train_data``, which must then be given.
        word2vec_to_solve_oov : bool
            Whether to use word2vec for replacement (passed to the feature
            encoder).
        n_estimators_list : array-like
            Parameter under validation: number of trees in the random forest.
            Has to be supplied, since the None default is not iterable.
        feature_type : str
            Feature type, only in ['word', 'seg', 'word_seg'].
        shuffle_data : bool
            Whether to shuffle the data.
        verbose : int
            The larger the value, the more detailed the output.
        cv : int
            Number of folds.
        need_transform_input : bool
            Whether the data must be converted by the feature encoder. If
            False, ``cv_data`` is taken as-is, apart from padding its items
            with None to 6 entries when the first item has fewer.
        need_segmented : bool
            Whether word segmentation is needed.
        include_train_data : bool
            Whether the training data is validated as well.
        need_validation : bool
            Whether to validate.

        Returns
        -------
        None
        """

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score
        # 1. Obtain the cross-validation folds.
        if cv_data is None:
            assert train_data is not None, 'cv_data和train_data必须至少提供一个!'
            cv_data = get_k_fold_data(
                k=cv,
                train_data=train_data,
                test_data=test_data,
                include_train_data=include_train_data,
            )
        # region 2. Encode the folds into model features
        if need_transform_input:
            feature_encoder = BowRandomForest.get_feature_encoder(
                verbose=verbose,
                need_segmented=need_segmented,
                feature_type=feature_type,
                word2vec_to_solve_oov=word2vec_to_solve_oov,
                word2vec_model_file_path=word2vec_model_file_path,
            )
            # diff_train_val_feature_encoder=1: a different feature encoder
            # every time.
            cv_data = transform_cv_data(feature_encoder, cv_data, verbose=verbose, diff_train_val_feature_encoder=1)
        elif len(cv_data) > 0 and len(cv_data[0]) < 6:
            # Each cv_data item needs 6 entries, so fill the missing ones
            # with None. The former single appended None left items shorter
            # than 5 still too short and raised on tuple items.
            cv_data = [list(item) + [None] * (6 - len(item)) for item in cv_data]
        # endregion

        # region 3. Cross validation
        for n_estimators in n_estimators_list:
            print('=' * 40)
            print('n_estimators is %d.' % n_estimators)
            get_val_score(BowRandomForest,
                          # A new shallow copy of the fold list per run.
                          cv_data=cv_data[:],
                          verbose=verbose,
                          shuffle_data=shuffle_data,
                          need_validation=need_validation,
                          n_estimators=n_estimators,
                          )
コード例 #11
0
    def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        feature_type='word',
        input_length=None,
        num_filter_list=None,
        verbose=0,
        cv=3,
        batch_size=32,
        lr=1e-2,
        need_segmented=True,
        word2vec_model_file_path=None,
        num_labels=24,
        embedding_weight_trainable=False,
        # Retrieving the middle-layer output
        get_cnn_middle_layer_output=False,
        middle_layer_output_file=None,
        rand_weight=False,
        need_validation=True,
        include_train_data=True,
        vocabulary_including_test_set=True,
    ):
        """Run cross validation over the candidate ``num_filter`` values.

        Parameters
        ----------
        train_data : array-like
            Training data ``(train_X, train_y)``.
        test_data : array-like
            Test data.
        cv_data : array-like
            The k folds of validation data. If None, they are produced by
            ``get_k_fold_data`` from ``train_data``, which must then be given.
        input_length : int
            Input length.
        num_filter_list : array-like
            Parameter under validation: number of filters. Has to be
            supplied, since the None default is not iterable.
        middle_layer_output_file : str
            File the middle-layer output is written to. Must be set whenever
            ``get_cnn_middle_layer_output`` is True.
        get_cnn_middle_layer_output : bool
            Whether to fetch the middle-layer output (default False).
        num_labels : int
            Labels (passed to ``get_val_score``).
        batch_size : int
            Batch size.
        lr : float
            Passed to ``get_val_score``.
        vocabulary_including_test_set : bool, default True
            Whether the dictionary includes the test set.
        include_train_data : bool
            Whether the training data is validated as well.
        need_validation : bool
            Whether to validate.
        embedding_weight_trainable : bool
            Switches between CNN(static-w2v) and CNN(non-static-w2v).
        rand_weight : bool
            Switches between CNN(rand) and CNN(static/non-static-w2v).
        feature_type : str
            Feature type.
        verbose : int
            The larger the value, the more detailed the output.
        cv : int
            Number of folds.
        need_segmented : bool
            Whether word segmentation is needed.
        word2vec_model_file_path
            Passed to ``get_val_score``; also shown in the banner unless
            ``rand_weight`` is set.

        Raises
        ------
        ValueError
            When ``get_cnn_middle_layer_output`` is True and
            ``middle_layer_output_file`` is None.

        Notes
        ----------
        - For efficiency ``update_dictionary=False`` is the default here, which
          keeps the feature encoder's dictionary consistent and avoids
          rebuilding it repeatedly.
        - ``diff_train_val_feature_encoder=1`` is set too, so that training and
          validation sets get different feature encoders, because the
          dictionary sizes differ.

        Examples
        ----------
        >>> train_x = ['你好', '测试句子', '我要买手机', '今天天气不错', '无聊']
        >>> train_y = [1, 2, 3, 2, 3]
        >>> test_x = ['你好', '不错哟']
        >>> test_y = [1, 2]
        >>> WordEmbeddingCNNWithOneConv.cross_validation(
        ...     train_data=(train_x, train_y),
        ...     test_data=(test_x, test_y),
        ...     input_length=8,
        ...     num_filter_list=[5, 50],
        ...     verbose=1,
        ...     word2vec_model_file_path='path/to/vector1000000_50dim.gem',
        ... )

        """
        # Reject an unusable configuration up front rather than after all the
        # training has already been done.
        if get_cnn_middle_layer_output and middle_layer_output_file is None:
            raise ValueError(
                'middle_layer_output_file must be set when '
                'get_cnn_middle_layer_output is True')

        print('=' * 80)
        print(
            'feature_type: %s, need_segmented: %s, vocabulary_including_test_set: %s'
            % (feature_type, need_segmented, vocabulary_including_test_set))
        # input_length may be left at its None default, which %d cannot
        # format; %s prints ints identically and tolerates None.
        print('input_length: %s, num_labels: %d' % (input_length, num_labels))
        print(
            'lr: %f, batch_size: %d, rand_weight: %s, embedding_weight_trainable: %s'
            % (lr, batch_size, rand_weight, embedding_weight_trainable))
        if not rand_weight:
            print('W2V model file_path: %s' % word2vec_model_file_path)
        print('=' * 80)

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

        # 1. Obtain the cross-validation folds.
        if cv_data is None:
            assert train_data is not None, 'cv_data和train_data必须至少提供一个!'
            cv_data = get_k_fold_data(
                k=cv,
                train_data=train_data,
                test_data=test_data,
                include_train_data=include_train_data,
            )

        # 2. Encode the folds into model features.
        feature_encoder = WordEmbeddingCNN.get_feature_encoder(
            need_segmented=need_segmented,
            input_length=input_length,
            verbose=1,
            feature_type=feature_type,
            padding_mode='center',
            # Keep the dictionary consistent.
            update_dictionary=False,
            vocabulary_including_test_set=vocabulary_including_test_set,
        )

        cv_data = transform_cv_data(feature_encoder,
                                    cv_data,
                                    verbose=verbose,
                                    diff_train_val_feature_encoder=1)

        # 3. Cross validation
        for num_filter in num_filter_list:
            print('=' * 40)
            print('num_filter is %d.' % num_filter)
            _, _, middle_output_dev, middle_output_val = get_val_score(
                WordEmbeddingCNNWithOneConv,
                cv_data=cv_data[:],
                verbose=verbose,
                num_filter=num_filter,
                num_labels=num_labels,
                word2vec_model_file_path=word2vec_model_file_path,
                embedding_weight_trainable=embedding_weight_trainable,
                get_cnn_middle_layer_output=get_cnn_middle_layer_output,
                need_validation=need_validation,
                rand_weight=rand_weight,
                batch_size=batch_size,
                lr=lr,
            )

            if get_cnn_middle_layer_output:
                # Save the intermediate results. Binary mode is required
                # because pickle emits bytes.
                # NOTE(review): opening in write mode inside the loop
                # truncates the file each time, so only the outputs of the
                # last num_filter survive -- confirm this is intended.
                with open(middle_layer_output_file, 'wb') as fout:
                    pickle.dump(cv_data, fout)
                    pickle.dump(middle_output_dev, fout)
                    pickle.dump(middle_output_val, fout)
コード例 #12
0
    def cross_validation(
            train_data=None,
            test_data=None,
            cv_data=None,
            feature_type='word',
            input_length=None,
            num_filter_list=None,
            verbose=0,
            cv=3,
            batch_size=32,
            lr=1e-2,
            need_segmented=True,
            word2vec_model_file_path=None,
            num_labels=24,
            embedding_weight_trainable=False,
            shuffle_data=True,
            rand_weight=False,
            need_validation=True,
            include_train_data=True,
            vocabulary_including_test_set=True,
            n_estimators_list=None,
    ):
        """Cross-validate every (num_filter, n_estimators) combination.

        The routine prints the run configuration, builds the k-fold data when
        ``cv_data`` is not supplied, encodes the folds with this model's
        feature encoder, and then calls ``get_val_score`` once for each pair
        taken from ``num_filter_list`` x ``n_estimators_list``.

        NOTE(review): the signature has no ``self``/``cls``; presumably this is
        declared as a static method on the enclosing class -- the decorator is
        not visible from here, confirm.

        Parameters
        ----------
        train_data, test_data :
            Forwarded to ``get_k_fold_data`` when ``cv_data`` is None.
            ``train_data`` must then be provided.
        cv_data :
            Pre-built folds. When given, ``train_data``/``test_data``, ``cv``
            and ``include_train_data`` are not used.
        feature_type, input_length, need_segmented, vocabulary_including_test_set :
            Forwarded to ``get_feature_encoder``.
        num_filter_list, n_estimators_list :
            Iterables of integers (they are printed with ``%d``) forming the
            search grid. ``num_filter_list`` drives the outer loop.
        verbose :
            Forwarded to ``transform_cv_data`` and ``get_val_score``.
        cv :
            Number of folds ``k`` passed to ``get_k_fold_data``.
        shuffle_data :
            Forwarded to ``transform_cv_data``.
        batch_size, lr, word2vec_model_file_path, num_labels,
        embedding_weight_trainable, rand_weight, need_validation :
            Forwarded unchanged to ``get_val_score``.
        include_train_data :
            Forwarded to ``get_k_fold_data``.

        Raises
        ------
        AssertionError
            If both ``cv_data`` and ``train_data`` are None.

        Notes
        -----
        Whatever ``get_val_score`` returns is discarded; nothing is returned
        from this function.
        """
        banner = '=' * 80
        print(banner)
        print('feature_type:%s,need_segmented:%s,vocabulary_including_test_set:%s' % (
            feature_type, need_segmented, vocabulary_including_test_set))
        print('rand_weight:%s,embedding_weight_trainable:%s' % (rand_weight, embedding_weight_trainable))
        print(banner)

        from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

        # 1. Obtain the cross-validation folds (build them unless supplied).
        if cv_data is None:
            assert train_data is not None, 'cv_data和train_data必须至少提供一个!'
            cv_data = get_k_fold_data(
                k=cv,
                train_data=train_data,
                test_data=test_data,
                include_train_data=include_train_data,
            )

        # 2. Build the feature encoder used to transform the folds.
        feature_encoder = RFAndWordEmbeddingCnnMerge.get_feature_encoder(
            need_segmented=need_segmented,
            input_length=input_length,
            verbose=1,
            feature_type=feature_type,
            # Keep the dictionary fixed (do not update it).
            update_dictionary=False,
            vocabulary_including_test_set=vocabulary_including_test_set,
        )

        # Encode the folds. diff_train_val_feature_encoder=1 forces the
        # training and validation sets to use different feature-encoder
        # dictionaries.
        cv_data = transform_cv_data(
            feature_encoder,
            cv_data,
            verbose=verbose,
            shuffle_data=shuffle_data,
            diff_train_val_feature_encoder=1,
        )

        # 3. Grid search: one cross-validation run per parameter pair.
        for num_filter in num_filter_list:
            for n_estimators in n_estimators_list:
                print('=' * 40)
                print('num_filter and n_estimators is %d,%d.' % (num_filter, n_estimators))
                # Fixed: the model class was spelled
                # ``RFAndRFAndWordEmbeddingCnnMerge`` (duplicated prefix), which
                # does not match the class used for the feature encoder above.
                get_val_score(
                    RFAndWordEmbeddingCnnMerge,
                    num_filter=num_filter,
                    n_estimators=n_estimators,
                    # Hand each run its own shallow copy of the fold list.
                    cv_data=cv_data[:],
                    verbose=verbose,
                    num_labels=num_labels,
                    word2vec_model_file_path=word2vec_model_file_path,
                    embedding_weight_trainable=embedding_weight_trainable,
                    need_validation=need_validation,
                    rand_weight=rand_weight,
                    batch_size=batch_size,
                    lr=lr,
                )