def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        input_length=None,
        bow_num_filter_list=None,
        w2v_num_filter_list=None,
        bow_region_size_list=None,
        verbose=0,
        word2vec_model_file_path=None,
):
    from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

    # 1. Build the cross-validation folds.
    if cv_data is None:
        assert train_data is not None, 'At least one of cv_data and train_data must be provided!'
        cv_data = get_k_fold_data(
            k=3,
            train_data=train_data,
            test_data=test_data,
            include_train_data=True,
        )

    # 2. Encode the data into features.
    feature_encoder = BowWordEmbeddingMergeCNN.get_feature_encoder(
        input_length=input_length,
        verbose=0,
        feature_type='word',
    )

    cv_data = transform_cv_data(feature_encoder, cv_data, verbose=0)

    # Cross-validation over the parameter grid.
    for bow_num_filter in bow_num_filter_list:
        for bow_region_size in bow_region_size_list:
            for w2v_num_filter in w2v_num_filter_list:
                print('=' * 40)
                print('bow_num_filter, bow_region_size and w2v_num_filter is %d,%d,%d.' % (
                    bow_num_filter, bow_region_size, w2v_num_filter))
                get_val_score(
                    BowWordEmbeddingMergeCNNWithOneConv,
                    cv_data=cv_data,
                    verbose=verbose,
                    bow_num_filter=bow_num_filter,
                    bow_region_size=bow_region_size,
                    w2v_num_filter=w2v_num_filter,
                    num_labels=24,
                    word2vec_model_file_path=word2vec_model_file_path,
                )
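# Hedged usage sketch (not part of the original code): one plausible way to drive the grid
# search above. In the repository this function is presumably exposed as a static method of
# BowWordEmbeddingMergeCNNWithOneConv (the class passed to get_val_score); the toy sentences,
# input_length, filter/region grids and the word2vec path are illustrative assumptions only.
def _example_bow_w2v_merge_cv():
    train_x = ['你好', '测试句子', '我要买手机', '今天天气不错', '无聊']
    train_y = [1, 2, 3, 2, 3]
    test_x = ['你好', '不错哟']
    test_y = [1, 2]
    BowWordEmbeddingMergeCNNWithOneConv.cross_validation(
        train_data=(train_x, train_y),
        test_data=(test_x, test_y),
        input_length=8,
        bow_num_filter_list=[5, 50],
        bow_region_size_list=[1, 2],
        w2v_num_filter_list=[5, 50],
        verbose=1,
        word2vec_model_file_path='vector1000000_50dim.gem',  # assumed path to a trained word2vec model
    )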
def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        input_length=None,
        conv1_num_filter_list=None,
        conv2_num_filter_list=None,
        verbose=0,
        word2vec_model_file_path=None,
):
    from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

    # 1. Build the cross-validation folds.
    if cv_data is None:
        assert train_data is not None, 'At least one of cv_data and train_data must be provided!'
        cv_data = get_k_fold_data(
            k=3,
            train_data=train_data,
            test_data=test_data,
            include_train_data=True,
        )

    # 2. Encode the data into features.
    feature_encoder = DCNN.get_feature_encoder(
        input_length=input_length,
        verbose=0,
        full_mode=False,
        feature_type='word',
    )

    cv_data = transform_cv_data(feature_encoder, cv_data, verbose=0)

    # Cross-validation over the parameter grid.
    for conv1_num_filter in conv1_num_filter_list:
        for conv2_num_filter in conv2_num_filter_list:
            print('=' * 40)
            print('num_filter of conv1 and conv2 is %d,%d.' % (conv1_num_filter, conv2_num_filter))
            get_val_score(
                DcnnAcl,
                cv_data=cv_data,
                verbose=verbose,
                conv1_num_filter=conv1_num_filter,
                conv2_num_filter=conv2_num_filter,
                num_labels=24,
                word2vec_model_file_path=word2vec_model_file_path,
            )
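# Hedged usage sketch (illustration only): invoking the DCNN grid search above over both
# convolution-layer filter counts. It assumes the function is exposed as DcnnAcl.cross_validation
# (the class passed to get_val_score); the data, input_length and word2vec path are placeholders.
def _example_dcnn_cv():
    train_x = ['你好', '测试句子', '我要买手机', '今天天气不错', '无聊']
    train_y = [1, 2, 3, 2, 3]
    DcnnAcl.cross_validation(
        train_data=(train_x, train_y),
        test_data=(['你好', '不错哟'], [1, 2]),
        input_length=8,
        conv1_num_filter_list=[4, 6],
        conv2_num_filter_list=[10, 14],
        verbose=1,
        word2vec_model_file_path='vector1000000_50dim.gem',  # assumed path
    )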
def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        input_length=None,
        feature_type='word',
        num_filter_list=None,
        region_size_list=None,
        word2vec_to_solve_oov=False,
        word2vec_model_file_path=None,
        verbose=0,
):
    from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

    # 1. Build the cross-validation folds.
    if cv_data is None:
        assert train_data is not None, 'At least one of cv_data and train_data must be provided!'
        cv_data = get_k_fold_data(
            k=3,
            train_data=train_data,
            test_data=test_data,
            include_train_data=True,
        )

    # 2. Encode the data into features.
    feature_encoder = OnehotBowCNN.get_feature_encoder(
        input_length=input_length,
        verbose=verbose,
        feature_type=feature_type,
        word2vec_to_solve_oov=word2vec_to_solve_oov,
        word2vec_model_file_path=word2vec_model_file_path,
    )

    cv_data = transform_cv_data(feature_encoder, cv_data, verbose=0)

    # Cross-validation over the parameter grid.
    for num_filter in num_filter_list:
        for region_size in region_size_list:
            print('=' * 40)
            print('num_filter and region_size is %d,%d.' % (num_filter, region_size))
            get_val_score(
                OnehotCNNWithOneConv,
                cv_data=cv_data,
                verbose=verbose,
                region_size=region_size,
                num_filter=num_filter,
                num_labels=24,
            )
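# Hedged usage sketch (illustration only): the one-hot CNN grid search above over num_filter
# and region_size, with the option of letting word2vec substitute OOV words. It assumes the
# function is exposed as OnehotCNNWithOneConv.cross_validation; all concrete values are placeholders.
def _example_onehot_cnn_cv():
    train_x = ['你好', '测试句子', '我要买手机', '今天天气不错', '无聊']
    train_y = [1, 2, 3, 2, 3]
    OnehotCNNWithOneConv.cross_validation(
        train_data=(train_x, train_y),
        test_data=(['你好', '不错哟'], [1, 2]),
        input_length=8,
        feature_type='word',
        num_filter_list=[10, 50],
        region_size_list=[2, 3],
        word2vec_to_solve_oov=False,
        verbose=1,
    )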
def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        shuffle_data=True,
        n_estimators_list=None,
        feature_type='word',
        word2vec_to_solve_oov=False,
        word2vec_model_file_path=None,
        verbose=0,
        cv=3,
        need_transform_input=True,
        need_segmented=True,
        need_validation=True,
        include_train_data=True,
):
    """Run cross-validation over the parameter grid.

    Parameters
    ----------
    word2vec_model_file_path : str
        Path to the word2vec model.
    train_data : (array-like, array-like)
        Training data (train_X, train_y).
    test_data : (array-like, array-like)
        Test data (test_X, test_y).
    cv_data : array-like
        The k folds of validation data.
    word2vec_to_solve_oov : bool
        Whether to use word2vec to substitute OOV words.
    n_estimators_list : array-like
        Parameter grid: number of trees in the random forest.
    feature_type : str
        Feature type, only in ['word', 'seg', 'word_seg'].
    shuffle_data : bool
        Whether to shuffle the data.
    verbose : int
        The larger the value, the more detailed the output.
    cv : int
        Run cv-fold validation.
    need_transform_input : bool
        Whether the input data needs to be feature-encoded.
    need_segmented : bool
        Whether the input needs word segmentation.
    include_train_data : bool
        Whether to also validate on the training data.
    need_validation : bool
        Whether to run validation.
    """

    from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

    # 1. Build the cross-validation folds.
    if cv_data is None:
        assert train_data is not None, 'At least one of cv_data and train_data must be provided!'
        cv_data = get_k_fold_data(
            k=cv,
            train_data=train_data,
            test_data=test_data,
            include_train_data=include_train_data,
        )

    # region 2. Encode the data into features.
    if need_transform_input:
        feature_encoder = BowRandomForest.get_feature_encoder(
            verbose=verbose,
            need_segmented=need_segmented,
            feature_type=feature_type,
            word2vec_to_solve_oov=word2vec_to_solve_oov,
            word2vec_model_file_path=word2vec_model_file_path,
        )
        # diff_train_val_feature_encoder=1: use a different feature encoder for each fold.
        cv_data = transform_cv_data(feature_encoder, cv_data, verbose=verbose, diff_train_val_feature_encoder=1)
    else:
        if len(cv_data[0]) < 6:
            # Each cv_data entry needs 6 items; pad the missing one with None.
            cv_data = [item + [None] for item in cv_data]
    # endregion

    # region 3. Cross-validation over the parameter grid.
    for n_estimators in n_estimators_list:
        print('=' * 40)
        print('n_estimators is %d.' % n_estimators)
        get_val_score(
            BowRandomForest,
            cv_data=cv_data[:],
            verbose=verbose,
            shuffle_data=shuffle_data,
            need_validation=need_validation,
            n_estimators=n_estimators,
        )
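# Hedged usage sketch (illustration only): the bag-of-words random forest search above,
# sweeping the number of trees. It assumes the function is exposed as
# BowRandomForest.cross_validation; the toy data and the n_estimators grid are placeholders.
def _example_bow_random_forest_cv():
    train_x = ['你好', '测试句子', '我要买手机', '今天天气不错', '无聊']
    train_y = [1, 2, 3, 2, 3]
    BowRandomForest.cross_validation(
        train_data=(train_x, train_y),
        test_data=(['你好', '不错哟'], [1, 2]),
        n_estimators_list=[10, 50, 200],
        feature_type='word',
        cv=3,
        verbose=1,
    )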
def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        feature_type='word',
        input_length=None,
        num_filter_list=None,
        verbose=0,
        cv=3,
        batch_size=32,
        lr=1e-2,
        need_segmented=True,
        word2vec_model_file_path=None,
        num_labels=24,
        embedding_weight_trainable=False,
        # middle-layer output
        get_cnn_middle_layer_output=False,
        middle_layer_output_file=None,
        rand_weight=False,
        need_validation=True,
        include_train_data=True,
        vocabulary_including_test_set=True,
):
    """
    Parameters
    ----------
    train_data : array-like
        Training data (train_X, train_y).
    test_data : array-like
        Test data.
    cv_data : array-like
        The k folds of validation data.
    input_length : int
        Input length.
    num_filter_list : array-like
        Parameter grid: number of filters.
    middle_layer_output_file : str
        File to which the middle-layer output is written.
    get_cnn_middle_layer_output : bool
        Whether to collect the CNN middle-layer output (default False).
    num_labels : int
        Number of labels.
    batch_size : int
        Batch size.
    vocabulary_including_test_set : bool, default True
        Whether the vocabulary includes the test set.
    include_train_data : bool
        Whether to also validate on the training data.
    need_validation : bool
        Whether to run validation.
    embedding_weight_trainable : bool
        Switches between CNN(static-w2v) and CNN(non-static-w2v).
    rand_weight : bool
        Switches between CNN(rand) and CNN(static/non-static-w2v).
    feature_type : str
        Feature type.
    verbose : int
        The larger the value, the more detailed the output.
    cv : int
        Run cv-fold validation.
    need_segmented : bool
        Whether the input needs word segmentation.
    word2vec_model_file_path

    Notes
    ----------
    - For efficiency, update_dictionary=False by default, so the feature encoder keeps a
      consistent dictionary and avoids rebuilding it repeatedly.
    - diff_train_val_feature_encoder=1 keeps the feature encoders of the training and validation
      sets separate, because their dictionary sizes differ.

    Examples
    ----------
    >>> train_x = ['你好', '测试句子', '我要买手机', '今天天气不错', '无聊']
    >>> train_y = [1, 2, 3, 2, 3]
    >>> test_x = ['你好', '不错哟']
    >>> test_y = [1, 2]
    >>> cv_x = [['你好', '无聊'], ['测试句子', '今天天气不错'], ['我要买手机']]
    >>> cv_y = [[1, 3], [2, 2], [3]]
    >>> WordEmbeddingCNNWithOneConv.cross_validation(
    ...     train_data=(train_x, train_y),
    ...     test_data=(test_x, test_y),
    ...     input_length=8,
    ...     num_filter_list=[5, 50],
    ...     verbose=1,
    ...     word2vec_model_file_path='/home/jdwang/PycharmProjects/nlp_util/data_processing_util/word2vec_util/vector/50dim/vector1000000_50dim.gem',
    ... )
    """

    print('=' * 80)
    print('feature_type: %s, need_segmented: %s, vocabulary_including_test_set: %s' % (
        feature_type, need_segmented, vocabulary_including_test_set))
    print('input_length: %d, num_labels: %d' % (input_length, num_labels))
    print('lr: %f, batch_size: %d, rand_weight: %s, embedding_weight_trainable: %s' % (
        lr, batch_size, rand_weight, embedding_weight_trainable))
    if not rand_weight:
        print('W2V model file_path: %s' % word2vec_model_file_path)
    print('=' * 80)

    import pickle
    from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

    # 1. Build the cross-validation folds.
    if cv_data is None:
        assert train_data is not None, 'At least one of cv_data and train_data must be provided!'
        cv_data = get_k_fold_data(
            k=cv,
            train_data=train_data,
            test_data=test_data,
            include_train_data=include_train_data,
        )

    # 2. Encode the data into features.
    feature_encoder = WordEmbeddingCNN.get_feature_encoder(
        need_segmented=need_segmented,
        input_length=input_length,
        verbose=1,
        feature_type=feature_type,
        padding_mode='center',
        # Keep the dictionary consistent.
        update_dictionary=False,
        vocabulary_including_test_set=vocabulary_including_test_set,
    )

    cv_data = transform_cv_data(feature_encoder, cv_data, verbose=verbose, diff_train_val_feature_encoder=1)

    # Cross-validation over the parameter grid.
    for num_filter in num_filter_list:
        print('=' * 40)
        print('num_filter is %d.' % num_filter)
        _, _, middle_output_dev, middle_output_val = get_val_score(
            WordEmbeddingCNNWithOneConv,
            cv_data=cv_data[:],
            verbose=verbose,
            num_filter=num_filter,
            num_labels=num_labels,
            word2vec_model_file_path=word2vec_model_file_path,
            embedding_weight_trainable=embedding_weight_trainable,
            get_cnn_middle_layer_output=get_cnn_middle_layer_output,
            need_validation=need_validation,
            rand_weight=rand_weight,
            batch_size=batch_size,
            lr=lr,
        )

        if get_cnn_middle_layer_output:
            # Save the middle-layer outputs; pickle needs a binary file handle.
            with open(middle_layer_output_file, 'wb') as fout:
                pickle.dump(cv_data, fout)
                pickle.dump(middle_output_dev, fout)
                pickle.dump(middle_output_val, fout)
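# Hedged companion sketch (illustration only): how the pickled middle-layer outputs written
# above could be read back. It simply mirrors the three pickle.dump calls in order;
# middle_layer_output_file is whatever path was passed to cross_validation.
def _example_load_middle_layer_output(middle_layer_output_file):
    import pickle
    with open(middle_layer_output_file, 'rb') as fin:
        cv_data = pickle.load(fin)            # the transformed folds
        middle_output_dev = pickle.load(fin)  # middle-layer output on the dev folds
        middle_output_val = pickle.load(fin)  # middle-layer output on the validation folds
    return cv_data, middle_output_dev, middle_output_val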
def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        feature_type='word',
        input_length=None,
        num_filter_list=None,
        verbose=0,
        cv=3,
        batch_size=32,
        lr=1e-2,
        need_segmented=True,
        word2vec_model_file_path=None,
        num_labels=24,
        embedding_weight_trainable=False,
        shuffle_data=True,
        rand_weight=False,
        need_validation=True,
        include_train_data=True,
        vocabulary_including_test_set=True,
        n_estimators_list=None,
):
    print('=' * 80)
    print('feature_type:%s,need_segmented:%s,vocabulary_including_test_set:%s' % (
        feature_type, need_segmented, vocabulary_including_test_set))
    print('rand_weight:%s,embedding_weight_trainable:%s' % (rand_weight, embedding_weight_trainable))
    print('=' * 80)

    from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

    # 1. Build the cross-validation folds.
    if cv_data is None:
        assert train_data is not None, 'At least one of cv_data and train_data must be provided!'
        cv_data = get_k_fold_data(
            k=cv,
            train_data=train_data,
            test_data=test_data,
            include_train_data=include_train_data,
        )

    # 2. Encode the data into features.
    feature_encoder = WordEmbeddingCNN.get_feature_encoder(
        need_segmented=need_segmented,
        input_length=input_length,
        verbose=1,
        feature_type=feature_type,
        # Keep the dictionary consistent.
        update_dictionary=False,
        vocabulary_including_test_set=vocabulary_including_test_set,
    )

    cv_data = transform_cv_data(feature_encoder, cv_data, verbose=verbose, diff_train_val_feature_encoder=1)

    # Cross-validation over the parameter grid.
    for num_filter in num_filter_list:
        for n_estimators in n_estimators_list:
            print('=' * 40)
            print('num_filter and n_estimators is %d,%d.' % (num_filter, n_estimators))
            get_val_score(
                RFAndRFAndWordEmbeddingCnnMerge,
                num_filter=num_filter,
                n_estimators=n_estimators,
                cv_data=cv_data[:],
                verbose=verbose,
                num_labels=num_labels,
                word2vec_model_file_path=word2vec_model_file_path,
                embedding_weight_trainable=embedding_weight_trainable,
                need_validation=need_validation,
                rand_weight=rand_weight,
                batch_size=batch_size,
                lr=lr,
                shuffle_data=shuffle_data,
            )
def cross_validation(cv_data, test_data, result_file_path, **kwargs):
    """Run cross-validation over the parameter grid.

    :param cv_data: the k folds of training data
    :type cv_data: array-like
    :param test_data: test data
    :type test_data: array-like
    :return:
    """

    from itertools import product
    from data_processing_util.cross_validation_util import transform_cv_data, get_val_score

    nb_epoch = kwargs['nb_epoch']
    verbose = kwargs['verbose']
    num_labels = kwargs['num_labels']
    word_input_length, seg_input_length = 10, 7
    remove_stopword = kwargs['remove_stopword']
    word2vec_to_solve_oov = kwargs['word2vec_to_solve_oov']
    rand_seed = kwargs['rand_seed']
    l1_conv_filter_type = kwargs['l1_conv_filter_type']
    l2_conv_filter_type = kwargs['l2_conv_filter_type']
    k = kwargs['k']
    lr = kwargs['lr']
    use_layer = kwargs['use_layer']
    layer1 = kwargs.get('layer1') or [-1]
    layer2 = kwargs.get('layer2') or [-1]
    hidden1 = kwargs.get('hidden1') or [-1]
    hidden2 = kwargs.get('hidden2') or [-1]

    # Detailed results are saved to...
    detail_result_file_path = result_file_path
    fout = open(detail_result_file_path, 'w')

    print('=' * 150)
    print('Parameters being tuned....')
    print('use_layer:%s' % use_layer)
    print('layer1:%s' % str(layer1))
    print('layer2:%s' % str(layer2))
    print('hidden1:%s' % str(hidden1))
    print('hidden2:%s' % str(hidden2))
    print('-' * 150)
    print('word_input_length:%d\nseg_input_length:%d' % (word_input_length, seg_input_length))
    print('word2vec_to_solve_oov:%s\nremove_stopword:%s\nnb_epoch:%d\nrand_seed:%d' % (
        word2vec_to_solve_oov, remove_stopword, nb_epoch, rand_seed))
    print('l1_conv_filter_type:%s' % l1_conv_filter_type)
    print('l2_conv_filter_type:%s' % l2_conv_filter_type)
    print('k:%s' % k)
    print('=' * 150)

    fout.write('=' * 150 + '\n')
    fout.write('CV results:\n')
    fout.write('lr:%f\nnb_epoch:%d\nrand_seed:%d\n' % (lr, nb_epoch, rand_seed))
    fout.write('l1_conv_filter_type:%s\n' % l1_conv_filter_type)
    fout.write('l2_conv_filter_type:%s\n' % l2_conv_filter_type)
    fout.write('k:%s\n' % k)
    fout.write('=' * 150 + '\n')

    word_feature_encoder, seg_feature_encoder = MultiChannelOnehotBowCNN.get_feature_encoder(
        **{'word_input_length': word_input_length, 'seg_input_length': seg_input_length}
    )

    all_cv_word_data = transform_cv_data(word_feature_encoder, cv_data, test_data, **kwargs)
    all_cv_seg_data = transform_cv_data(seg_feature_encoder, cv_data, test_data, **kwargs)

    # Merge the word channel and the segmentation channel into two-input folds.
    cv_data = [
        ([dev_word_X, dev_seg_X], dev_y, [val_word_X, val_seg_X], val_y, (word_feature_encoder, seg_feature_encoder))
        for (dev_word_X, dev_y, val_word_X, val_y, word_feature_encoder),
            (dev_seg_X, dev_y, val_seg_X, val_y, seg_feature_encoder)
        in zip(all_cv_word_data, all_cv_seg_data)
    ]

    # Cross-validation over the parameter grid.
    parameters = product(layer1, layer2, hidden1, hidden2)
    for l1, l2, h1, h2 in parameters:
        fout.write('=' * 150 + '\n')
        fout.write('layer1:%d,layer2:%d,hidden1:%d,hidden2:%d\n' % (l1, l2, h1, h2))
        print('layer1:%d,layer2:%d,hidden1:%d,hidden2:%d' % (l1, l2, h1, h2))

        l1_conv_filter = []
        if 'conv1' in use_layer:
            l1_conv_filter.extend([
                [l1, l1_conv_filter_type[0][0], -1, l1_conv_filter_type[0][1], (0, 1), 0., 'relu', 'none'],
                [l1, l1_conv_filter_type[1][0], -1, l1_conv_filter_type[1][1], (0, 1), 0., 'relu', 'none'],
                [l1, l1_conv_filter_type[2][0], -1, l1_conv_filter_type[2][1], (0, 1), 0., 'relu', 'none'],
            ])

        full_connected_layer_units = []
        if 'hidden1' in use_layer:
            full_connected_layer_units.append([h1, 0., 'relu', 'none'])

        parm = {
            'l1_conv_filter_type': l1_conv_filter,
            'full_connected_layer_units': full_connected_layer_units,
            'num_labels': num_labels,
            'verbose': verbose,
            'nb_epoch': nb_epoch,
            'lr': lr,
        }

        get_val_score(MultiChannelOnehotBowCNN, cv_data, fout, **parm)

    fout.close()
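# Hedged usage sketch (illustration only): the keyword arguments that the multi-channel
# cross_validation above reads from **kwargs. The concrete values, and in particular the shape
# of l1_conv_filter_type (three [size, pool] pairs, inferred from the indexing above), are
# assumptions; it also assumes the function is exposed as MultiChannelOnehotBowCNN.cross_validation.
def _example_multi_channel_cv(cv_data, test_data):
    MultiChannelOnehotBowCNN.cross_validation(
        cv_data,
        test_data,
        result_file_path='cv_detail_result.txt',  # assumed output path
        nb_epoch=30,
        verbose=1,
        num_labels=24,
        remove_stopword=True,
        word2vec_to_solve_oov=False,
        rand_seed=7,
        l1_conv_filter_type=[[2, 2], [3, 3], [4, 4]],
        l2_conv_filter_type=[],
        k=[],
        lr=1e-2,
        use_layer=['conv1', 'hidden1'],
        layer1=[10, 30],
        hidden1=[50, 100],
    )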
def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        feature_type='word',
        input_length=None,
        num_filter_list=None,
        verbose=0,
        cv=3,
        batch_size=32,
        lr=1e-2,
        need_segmented=True,
        word2vec_model_file_path=None,
        num_labels=24,
        embedding_weight_trainable=False,
        shuffle_data=True,
        rand_weight=False,
        need_validation=True,
        include_train_data=True,
        vocabulary_including_test_set=True,
        n_estimators_list=None,
):
    print('=' * 80)
    print('feature_type:%s,need_segmented:%s,vocabulary_including_test_set:%s' % (
        feature_type, need_segmented, vocabulary_including_test_set))
    print('rand_weight:%s,embedding_weight_trainable:%s' % (rand_weight, embedding_weight_trainable))
    print('=' * 80)

    from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

    # 1. Build the cross-validation folds.
    if cv_data is None:
        assert train_data is not None, 'At least one of cv_data and train_data must be provided!'
        cv_data = get_k_fold_data(
            k=cv,
            train_data=train_data,
            test_data=test_data,
            include_train_data=include_train_data,
        )

    # 2. Encode the data into features.
    feature_encoder = RFAndWordEmbeddingCnnMerge.get_feature_encoder(
        need_segmented=need_segmented,
        input_length=input_length,
        verbose=1,
        feature_type=feature_type,
        # Keep the dictionary consistent.
        update_dictionary=False,
        vocabulary_including_test_set=vocabulary_including_test_set,
    )

    # Transform the folds.
    # diff_train_val_feature_encoder=1 ----> force the train and validation feature encoder dictionaries to differ.
    cv_data = transform_cv_data(feature_encoder, cv_data, verbose=verbose, shuffle_data=shuffle_data,
                                diff_train_val_feature_encoder=1)

    # Cross-validation over the parameter grid.
    for num_filter in num_filter_list:
        for n_estimators in n_estimators_list:
            print('=' * 40)
            print('num_filter and n_estimators is %d,%d.' % (num_filter, n_estimators))
            get_val_score(
                RFAndRFAndWordEmbeddingCnnMerge,
                num_filter=num_filter,
                n_estimators=n_estimators,
                cv_data=cv_data[:],
                verbose=verbose,
                num_labels=num_labels,
                word2vec_model_file_path=word2vec_model_file_path,
                embedding_weight_trainable=embedding_weight_trainable,
                need_validation=need_validation,
                rand_weight=rand_weight,
                batch_size=batch_size,
                lr=lr,
            )
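# Hedged usage sketch (illustration only): the merged random-forest + word-embedding-CNN search
# above, sweeping num_filter together with the forest size. It assumes the function is exposed as
# RFAndRFAndWordEmbeddingCnnMerge.cross_validation; data, grids and the word2vec path are placeholders.
def _example_rf_cnn_merge_cv():
    train_x = ['你好', '测试句子', '我要买手机', '今天天气不错', '无聊']
    train_y = [1, 2, 3, 2, 3]
    RFAndRFAndWordEmbeddingCnnMerge.cross_validation(
        train_data=(train_x, train_y),
        test_data=(['你好', '不错哟'], [1, 2]),
        input_length=8,
        num_filter_list=[5, 50],
        n_estimators_list=[50, 200],
        verbose=1,
        word2vec_model_file_path='vector1000000_50dim.gem',  # assumed path
    )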