# print(result_file_path) w2v_embedding_cnn = WordEmbeddingCNN( rand_seed=seed, verbose=1, optimizers=config['optimizers'], input_dim=init_weight.shape[0], word_embedding_dim=word_embedding_dim, input_length=sentence_padding_length, num_labels=len(label_to_index), l1_conv_filter_type= [[layer1, 2, 10, 'valid', [2, 1], 0.], [layer1, 4, 10, 'valid', [2, 1], 0.], [layer1, 6, 10, 'valid', [2, 1], 0.], ], l2_conv_filter_type= [ [layer2, 3, 1, 'valid', [2, 1], 0.25] ], full_connected_layer_units=[[hidden1,0.5],[hidden2,0.5]], embedding_dropout_rate=0., nb_epoch=30, earlyStoping_patience=config['earlyStoping_patience'], lr = config['lr'], batch_size = batch_size, embedding_weight_trainable = True, embedding_init_weight=init_weight, ) print (w2v_embedding_cnn.embedding_layer_output.get_weights()[0][1]) w2v_embedding_cnn.print_model_descibe()
def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        feature_type='word',
        input_length=None,
        num_filter_list=None,
        verbose=0,
        cv=3,
        batch_size=32,
        lr=1e-2,
        need_segmented=True,
        word2vec_model_file_path=None,
        num_labels=24,
        embedding_weight_trainable=False,
        # middle-layer-output options
        get_cnn_middle_layer_output=False,
        middle_layer_output_file=None,
        rand_weight=False,
        need_validation=True,
        include_train_data=True,
        vocabulary_including_test_set=True,
):
    """Run k-fold cross validation for ``WordEmbeddingCNNWithOneConv`` over a
    grid of filter counts.

    Parameters
    ----------
    train_data : array-like
        training data ``(train_X, train_y)``
    test_data : array-like
        test data
    cv_data : array-like
        k pre-built folds; built from train/test data when None
    input_length : int
        padded sentence length
    num_filter_list : array-like
        grid of "number of filters" values to validate
    middle_layer_output_file : str
        file the CNN middle-layer output is pickled to
    get_cnn_middle_layer_output : bool, default False
        whether to collect (and save) the CNN middle-layer output
    num_labels : int
        number of labels
    batch_size : int
    vocabulary_including_test_set : bool, default True
        whether the dictionary includes the test set
    include_train_data : bool
        whether to also validate on the training data
    need_validation : bool
    embedding_weight_trainable : bool
        switches between CNN(static-w2v) and CNN(non-static-w2v)
    rand_weight : bool
        switches between CNN(rand) and CNN(static/non-static-w2v)
    feature_type : str
    verbose : int
        higher values print more detail
    cv : int
        number of folds
    need_segmented : bool
        whether the input still needs word segmentation
    word2vec_model_file_path : str

    Notes
    -----
    - ``update_dictionary=False`` keeps the feature encoder's dictionary
      identical across folds and avoids rebuilding it.
    - ``diff_train_val_feature_encoder=1`` gives the train and validation
      sets different feature encoders (their dictionary sizes differ).
    """
    print('=' * 80)
    print('feature_type: %s, need_segmented: %s, vocabulary_including_test_set: %s'
          % (feature_type, need_segmented, vocabulary_including_test_set))
    print('input_length: %d, num_labels: %d' % (input_length, num_labels))
    print('lr: %f, batch_size: %d, rand_weight: %s, embedding_weight_trainable: %s'
          % (lr, batch_size, rand_weight, embedding_weight_trainable))
    if not rand_weight:
        print('W2V model file_path: %s' % word2vec_model_file_path)
    print('=' * 80)

    from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

    # 1. build the k folds unless cv_data is supplied directly
    if cv_data is None:
        assert train_data is not None, 'cv_data和train_data必须至少提供一个!'
        cv_data = get_k_fold_data(
            k=cv,
            train_data=train_data,
            test_data=test_data,
            include_train_data=include_train_data,
        )

    # 2. feature-encode every fold with a shared (frozen) dictionary
    feature_encoder = WordEmbeddingCNN.get_feature_encoder(
        need_segmented=need_segmented,
        input_length=input_length,
        verbose=1,
        feature_type=feature_type,
        padding_mode='center',
        # keep the dictionary fixed across folds
        update_dictionary=False,
        vocabulary_including_test_set=vocabulary_including_test_set,
    )

    cv_data = transform_cv_data(feature_encoder, cv_data, verbose=verbose, diff_train_val_feature_encoder=1)

    # grid-search over the number of convolution filters
    for num_filter in num_filter_list:
        print('=' * 40)
        print('num_filter is %d.' % num_filter)
        _, _, middle_output_dev, middle_output_val = get_val_score(
            WordEmbeddingCNNWithOneConv,
            cv_data=cv_data[:],
            verbose=verbose,
            num_filter=num_filter,
            num_labels=num_labels,
            word2vec_model_file_path=word2vec_model_file_path,
            embedding_weight_trainable=embedding_weight_trainable,
            get_cnn_middle_layer_output=get_cnn_middle_layer_output,
            need_validation=need_validation,
            rand_weight=rand_weight,
            batch_size=batch_size,
            lr=lr,
        )

        if get_cnn_middle_layer_output:
            # BUG FIX: pickle streams are binary — the original opened the
            # file in text mode ('w'), which breaks on Python 3 and can
            # corrupt the stream on Windows. Open in 'wb' instead.
            with open(middle_layer_output_file, 'wb') as fout:
                pickle.dump(cv_data, fout)
                pickle.dump(middle_output_dev, fout)
                pickle.dump(middle_output_val, fout)
def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        feature_type='word',
        input_length=None,
        num_filter_list=None,
        verbose=0,
        cv=3,
        batch_size=32,
        lr=1e-2,
        need_segmented=True,
        word2vec_model_file_path=None,
        num_labels=24,
        embedding_weight_trainable=False,
        shuffle_data=True,
        rand_weight=False,
        need_validation=True,
        include_train_data=True,
        vocabulary_including_test_set=True,
        n_estimators_list=None,
):
    # k-fold cross validation over the (num_filter, n_estimators) grid for the
    # RF + word-embedding-CNN merge model.
    # NOTE(review): `num_filter_list` and `n_estimators_list` must be iterable;
    # with the None defaults the loops below raise TypeError — confirm callers
    # always pass both lists.
    print('=' * 80)
    print(
        'feature_type:%s,need_segmented:%s,vocabulary_including_test_set:%s' % (
            feature_type, need_segmented, vocabulary_including_test_set))
    print('rand_weight:%s,embedding_weight_trainable:%s' % (rand_weight, embedding_weight_trainable))
    print('=' * 80)

    from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

    # 1. build the k folds unless cv_data is supplied directly
    if cv_data is None:
        assert train_data is not None, 'cv_data和train_data必须至少提供一个!'
        cv_data = get_k_fold_data(
            k=cv,
            train_data=train_data,
            test_data=test_data,
            include_train_data=include_train_data,
        )

    # 2. feature-encode every fold with a shared (frozen) dictionary
    feature_encoder = WordEmbeddingCNN.get_feature_encoder(
        need_segmented=need_segmented,
        input_length=input_length,
        verbose=1,
        feature_type=feature_type,
        # keep the dictionary fixed across folds
        update_dictionary=False,
        vocabulary_including_test_set=vocabulary_including_test_set,
    )

    cv_data = transform_cv_data(feature_encoder, cv_data, verbose=verbose, diff_train_val_feature_encoder=1)

    # grid search: cross-validate every (num_filter, n_estimators) pair
    for num_filter in num_filter_list:
        for n_estimators in n_estimators_list:
            print('=' * 40)
            print('num_filter and n_estimators is %d,%d.' % (num_filter, n_estimators))
            get_val_score(
                RFAndRFAndWordEmbeddingCnnMerge,
                num_filter=num_filter,
                n_estimators=n_estimators,
                cv_data=cv_data[:],
                verbose=verbose,
                num_labels=num_labels,
                word2vec_model_file_path=word2vec_model_file_path,
                embedding_weight_trainable=embedding_weight_trainable,
                need_validation=need_validation,
                rand_weight=rand_weight,
                batch_size=batch_size,
                lr=lr,
                shuffle_data=shuffle_data,
            )
def __init__(self, feature_encoder, num_filter, num_labels, n_estimators,
             word2vec_model_file_path,
             **kwargs):
    # Build the two sub-models: a w2v-initialised CNN used as a feature
    # extractor and a random forest trained on the CNN's middle output.
    # kwargs of interest: rand_weight, dataset_flag, verbose, nb_epoch,
    # batch_size, lr, embedding_weight_trainable.
    if kwargs.get('rand_weight', False):
        # CNN(rand) mode: leave the embedding layer randomly initialised
        weight = None
    elif kwargs['dataset_flag'] == 0:
        # training split: cache the embedding matrix on the class so the
        # w2v model file is loaded only once
        if RFAndWordEmbeddingCnnMerge.train_data_weight is None:
            RFAndWordEmbeddingCnnMerge.train_data_weight = feature_encoder.to_embedding_weight(
                word2vec_model_file_path)
        weight = RFAndWordEmbeddingCnnMerge.train_data_weight
    else:
        # kwargs['dataset_flag']>0 : validation split keeps its own cached copy
        if RFAndWordEmbeddingCnnMerge.val_data_weight is None:
            RFAndWordEmbeddingCnnMerge.val_data_weight = feature_encoder.to_embedding_weight(
                word2vec_model_file_path)
        weight = RFAndWordEmbeddingCnnMerge.val_data_weight
    # print(weight)
    self.static_w2v_cnn = WordEmbeddingCNN(
        rand_seed=1377,
        verbose=kwargs.get('verbose', 0),
        feature_encoder=feature_encoder,
        # optimizers='adadelta',
        optimizers='sgd',
        # only used in CNN(rand) mode
        word_embedding_dim=50,
        # initialise the embedding layer from the trained w2v model
        embedding_init_weight=weight,
        # by default the embedding weights stay frozen during training
        embedding_weight_trainable=kwargs.get('embedding_weight_trainable', False),
        num_labels=num_labels,
        l1_conv_filter_type=[
            [num_filter, 3, -1, 'valid', (-1, 1), 0.5, 'relu', 'none'],
            [num_filter, 4, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
            [num_filter, 5, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
        ],
        l2_conv_filter_type=[],
        full_connected_layer_units=[],
        embedding_dropout_rate=0.,
        nb_epoch=kwargs.get('nb_epoch', 25),
        batch_size=kwargs.get('batch_size', 32),
        earlyStoping_patience=30,
        lr=kwargs.get('lr', 1e-2),
        show_validate_accuracy=True if kwargs.get('verbose', 0) > 0 else False,
        # output_regularizer=('l2', 0.5),
        output_constraints=('maxnorm', 3),
        # must be True so the middle-layer output can be used as features
        save_middle_output=True,
    )
    self.bow_randomforest = BowRandomForest(
        rand_seed=1377,
        verbose=kwargs.get('verbose', 0),
        feature_encoder=feature_encoder,
        # optimizers='adadelta',
        n_estimators=n_estimators,
        min_samples_leaf=1,
    )
# Resolve the per-seed output path and dump all paths for traceability.
test_cnn_feature_file_path = test_cnn_feature_file_path % seed
print model_file_path
print result_file_path
print train_cnn_feature_file_path
print test_cnn_feature_file_path

# CNN(rand): randomly initialised embedding layer (no w2v weights supplied).
rand_embedding_cnn = WordEmbeddingCNN(
    rand_seed=seed,
    verbose=verbose,
    # +1 presumably reserves an index for the unknown/padding token — confirm
    input_dim=feature_encoder.vocabulary_size + 1,
    word_embedding_dim=config['word_embedding_dim'],
    input_length=config['sentence_padding_length'],
    num_labels=len(label_to_index),
    conv_filter_type=config['conv_filter_type'],
    k=config['kmax_k'],
    embedding_dropout_rate=config['embedding_dropout_rate'],
    output_dropout_rate=config['output_dropout_rate'],
    nb_epoch=int(config['cnn_nb_epoch']),
    earlyStoping_patience=config['earlyStoping_patience'],
)
rand_embedding_cnn.print_model_descibe()
# Only (re)train when forced by config or when no saved model exists yet.
if config['refresh_all_model'] or not os.path.exists(model_file_path):
    # train the model
    rand_embedding_cnn.fit((train_X_feature, train_y),
                           (test_X_feature, test_y))
    # persist the trained model
    rand_embedding_cnn.save_model(model_file_path)
label_to_index,index_to_label = data_util.get_label_index(version=config['label_version']) # **************************************************************** # ------------- region end : 1. 加载训练数据和测试数据 ------------- # **************************************************************** # **************************************************************** # +++++++++++++ region start : 2. 转换数据的格式并特征编码 +++++++++++++ # **************************************************************** logging.debug('=' * 20) logging.debug('2. 转换数据的格式并特征编码') from data_processing_util.cross_validation_util import transform_cv_data from deep_learning.cnn.wordEmbedding_cnn.wordEmbedding_cnn_model import WordEmbeddingCNN feature_encoder = WordEmbeddingCNN.get_feature_encoder( **{'input_length': input_length, 'feature_type':feature_type,} ) train_X_feature = feature_encoder.fit_transform(train_data=train_data['SENTENCE'].as_matrix()) feature_encoder.print_model_descibe() feature_encoder.print_sentence_length_detail() # train_y = train_data['LABEL_INDEX'].as_matrix() test_all_X_feature = feature_encoder.transform(test_data['SENTENCE'].as_matrix()) test_all_y = test_data['LABEL_INDEX'].as_matrix() print(train_data['LABEL_INDEX'].as_matrix()) print(train_X_feature.shape) print(test_all_X_feature.shape)
# Resolve the w2v model path for the configured embedding dimension.
word2vec_file_path = config['word2vec_model_file_path']
word2vec_file_path = word2vec_file_path % config['word_embedding_dim']
print(model_file_path)
print(word2vec_file_path)
# Repeat the experiment under several random seeds.
# NOTE(review): the embedding weight matrix is re-loaded from disk on every
# iteration — it could be hoisted out of the loop; verify before changing.
for seed in [10, 100, 500, 1337, 2000, 300]:
    w2v_embedding_cnn = WordEmbeddingCNN(
        rand_seed=seed,
        verbose=verbose,
        input_dim=feature_encoder.vocabulary_size + 1,
        word_embedding_dim=config['word_embedding_dim'],
        # initialise the embedding layer from the trained w2v model
        embedding_init_weight=feature_encoder.to_embedding_weight(word2vec_file_path),
        input_length=config['padding_length'],
        num_labels=len(label_to_index),
        conv_filter_type=config['conv_filter_type'],
        k=config['kmax_k'],
        embedding_dropout_rate=config['embedding_dropout_rate'],
        output_dropout_rate=config['output_dropout_rate'],
        nb_epoch=int(config['cnn_nb_epoch']),
        earlyStoping_patience=config['earlyStoping_patience'],
        # NOTE(review): an int is passed where an encoder object seems
        # expected — looks suspicious; verify against WordEmbeddingCNN's API.
        feature_encoder=feature_encoder.vocabulary_size + 1,
        optimizers='sgd',
        lr=1e-1,
        batch_size=128,
    )
    w2v_embedding_cnn.print_model_descibe()
    # Only (re)train when forced by config or when no saved model exists yet.
    if config['refresh_all_model'] or not os.path.exists(model_file_path):
        # train the model
        w2v_embedding_cnn.fit((train_w2v_features, train_y),
                              (test_w2v_features, test_y))
# Assemble the result path from its configured parts and bind the seed.
result_file_path = "".join([str(item) for item in config["result_file_path"]])
result_file_path = result_file_path % seed
print model_file_path
print result_file_path
print "rand seed:%d" % seed
# CNN with trainable embeddings, configured entirely from `config`.
rand_embedding_cnn = WordEmbeddingCNN(
    rand_seed=seed,
    verbose=1,
    optimizers=config["optimizers"],
    input_dim=feature_encoder.vocabulary_size,
    word_embedding_dim=config["word_embedding_dim"],
    input_length=sentence_padding_length,
    num_labels=len(label_to_index),
    l1_conv_filter_type=config["l1_conv_filter_type"],
    l2_conv_filter_type=config["l2_conv_filter_type"],
    full_connected_layer_units=config["full_connected_layer_units"],
    embedding_dropout_rate=config["embedding_dropout_rate"],
    nb_epoch=int(config["cnn_nb_epoch"]),
    earlyStoping_patience=config["earlyStoping_patience"],
    lr=config["lr"],
    batch_size=config["batch_size"],
    embedding_weight_trainable=True,
)
rand_embedding_cnn.print_model_descibe()
# Only (re)train when forced by config or when no saved model exists yet.
if config["refresh_all_model"] or not os.path.exists(model_file_path):
    print ("+" * 80)
    # train the model
def get_model(feature_encoder,
              num_filter,
              num_labels,
              word2vec_model_file_path,
              **kwargs):
    # print(WordEmbeddingCNNWithOneConv.weight)
    """Build the CNN(w2v) model.

    Parameters
    ----------
    feature_encoder : FeatureEncoder
        feature encoder
    num_filter : int
    num_labels : int
    word2vec_model_file_path : str
    kwargs : dict
        - dataset_flag
        - rand_weight : (default False) when True, builds the CNN(rand) model
        - verbose
        - embedding_weight_trainable
        - get_cnn_middle_layer_output

    Returns
    -------
    """
    if kwargs.get('rand_weight', False):
        # CNN(rand) mode: leave the embedding layer randomly initialised
        weight = None
    elif kwargs['dataset_flag'] == 0:
        # training split: cache the embedding matrix on the class so the
        # w2v model file is loaded only once
        if WordEmbeddingCNNWithOneConv.train_data_weight is None:
            WordEmbeddingCNNWithOneConv.train_data_weight = feature_encoder.to_embedding_weight(
                word2vec_model_file_path)
        weight = WordEmbeddingCNNWithOneConv.train_data_weight
    else:
        # kwargs['dataset_flag']>0 : validation split keeps its own cached copy
        if WordEmbeddingCNNWithOneConv.val_data_weight is None:
            WordEmbeddingCNNWithOneConv.val_data_weight = feature_encoder.to_embedding_weight(
                word2vec_model_file_path)
        weight = WordEmbeddingCNNWithOneConv.val_data_weight
    # print(weight)
    static_w2v_cnn = WordEmbeddingCNN(
        rand_seed=1377,
        verbose=kwargs.get('verbose', 0),
        feature_encoder=feature_encoder,
        # optimizers='adadelta',
        optimizers='sgd',
        # only used in CNN(rand) mode
        word_embedding_dim=300,
        # initialise the embedding layer from the trained w2v model
        embedding_init_weight=weight,
        # by default the embedding weights stay frozen during training
        embedding_weight_trainable=kwargs.get('embedding_weight_trainable', False),
        num_labels=num_labels,
        l1_conv_filter_type=[
            [num_filter, 3, -1, 'valid', (-1, 1), 0.5, 'relu', 'none'],
            [num_filter, 4, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
            [num_filter, 5, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
        ],
        l2_conv_filter_type=[],
        full_connected_layer_units=[],
        embedding_dropout_rate=0.,
        nb_epoch=kwargs.get('nb_epoch', 25),
        batch_size=kwargs.get('batch_size', 32),
        earlyStoping_patience=30,
        lr=kwargs.get('lr', 1e-2),
        show_validate_accuracy=True if kwargs.get('verbose', 0) > 0 else False,
        # output_regularizer=('l2', 0.5),
        output_constraints=('maxnorm', 3),
        # save middle output only when the caller asked for it
        save_middle_output=kwargs.get('get_cnn_middle_layer_output', False),
    )
    # static_w2v_cnn.print_model_descibe()
    # quit()
    return static_w2v_cnn
# Resolve the w2v model path for the configured embedding dimension and dump
# all output paths for traceability.
word2vec_file_path = (config['word2vec_file_path']) % config['word_embedding_dim']
print model_file_path
print result_file_path
print train_cnn_feature_file_path
print test_cnn_feature_file_path
print word2vec_file_path
# CNN whose embedding layer is initialised from the trained w2v model.
rand_embedding_cnn = WordEmbeddingCNN(
    rand_seed=seed,
    verbose=verbose,
    # +1 presumably reserves an index for the unknown/padding token — confirm
    input_dim=feature_encoder.vocabulary_size + 1,
    word_embedding_dim=config['word_embedding_dim'],
    embedding_init_weight=feature_encoder.to_embedding_weight(word2vec_file_path),
    input_length=config['sentence_padding_length'],
    num_labels=len(label_to_index),
    conv_filter_type=config['conv_filter_type'],
    k=config['kmax_k'],
    embedding_dropout_rate=config['embedding_dropout_rate'],
    output_dropout_rate=config['output_dropout_rate'],
    nb_epoch=int(config['cnn_nb_epoch']),
    earlyStoping_patience=config['earlyStoping_patience'],
)
rand_embedding_cnn.print_model_descibe()
# Only (re)train when forced by config or when no saved model exists yet.
if config['refresh_all_model'] or not os.path.exists(model_file_path):
    # train the model
    # NOTE(review): `map(...)` returns a list on Python 2 only; under
    # Python 3 this would pass a lazy iterator — confirm target version.
    rand_embedding_cnn.fit((feature_encoder.train_padding_index, train_y),
                           (map(feature_encoder.transform_sentence, test_X), test_y))
    # persist the trained model
    rand_embedding_cnn.save_model(model_file_path)
def cross_validation(
        train_data=None,
        test_data=None,
        cv_data=None,
        feature_type='word',
        input_length=None,
        num_filter_list=None,
        verbose=0,
        cv=3,
        batch_size=32,
        lr=1e-2,
        need_segmented=True,
        word2vec_model_file_path=None,
        num_labels=24,
        embedding_weight_trainable=False,
        # middle-layer-output options
        get_cnn_middle_layer_output=False,
        middle_layer_output_file=None,
        rand_weight=False,
        need_validation=True,
        include_train_data=True,
        vocabulary_including_test_set=True,
):
    """Run k-fold cross validation for ``WordEmbeddingCNNWithOneConv`` over a
    grid of filter counts.

    Parameters
    ----------
    train_data : array-like
        training data ``(train_X, train_y)``
    test_data : array-like
        test data
    cv_data : array-like
        k pre-built folds; built from train/test data when None
    input_length : int
        padded sentence length
    num_filter_list : array-like
        grid of "number of filters" values to validate
    middle_layer_output_file : str
        file the CNN middle-layer output is pickled to
    get_cnn_middle_layer_output : bool, default False
        whether to collect (and save) the CNN middle-layer output
    num_labels : int
        number of labels
    batch_size : int
    vocabulary_including_test_set : bool, default True
        whether the dictionary includes the test set
    include_train_data : bool
        whether to also validate on the training data
    need_validation : bool
    embedding_weight_trainable : bool
        switches between CNN(static-w2v) and CNN(non-static-w2v)
    rand_weight : bool
        switches between CNN(rand) and CNN(static/non-static-w2v)
    feature_type : str
    verbose : int
        higher values print more detail
    cv : int
        number of folds
    need_segmented : bool
        whether the input still needs word segmentation
    word2vec_model_file_path : str

    Notes
    -----
    - ``update_dictionary=False`` keeps the feature encoder's dictionary
      identical across folds and avoids rebuilding it.
    - ``diff_train_val_feature_encoder=1`` gives the train and validation
      sets different feature encoders (their dictionary sizes differ).
    """
    print('=' * 80)
    print('feature_type: %s, need_segmented: %s, vocabulary_including_test_set: %s'
          % (feature_type, need_segmented, vocabulary_including_test_set))
    print('input_length: %d, num_labels: %d' % (input_length, num_labels))
    print('lr: %f, batch_size: %d, rand_weight: %s, embedding_weight_trainable: %s'
          % (lr, batch_size, rand_weight, embedding_weight_trainable))
    if not rand_weight:
        print('W2V model file_path: %s' % word2vec_model_file_path)
    print('=' * 80)

    from data_processing_util.cross_validation_util import transform_cv_data, get_k_fold_data, get_val_score

    # 1. build the k folds unless cv_data is supplied directly
    if cv_data is None:
        assert train_data is not None, 'cv_data和train_data必须至少提供一个!'
        cv_data = get_k_fold_data(
            k=cv,
            train_data=train_data,
            test_data=test_data,
            include_train_data=include_train_data,
        )

    # 2. feature-encode every fold with a shared (frozen) dictionary
    feature_encoder = WordEmbeddingCNN.get_feature_encoder(
        need_segmented=need_segmented,
        input_length=input_length,
        verbose=1,
        feature_type=feature_type,
        padding_mode='center',
        # keep the dictionary fixed across folds
        update_dictionary=False,
        vocabulary_including_test_set=vocabulary_including_test_set,
    )

    cv_data = transform_cv_data(feature_encoder, cv_data, verbose=verbose, diff_train_val_feature_encoder=1)

    # grid-search over the number of convolution filters
    for num_filter in num_filter_list:
        print('=' * 40)
        print('num_filter is %d.' % num_filter)
        _, _, middle_output_dev, middle_output_val = get_val_score(
            WordEmbeddingCNNWithOneConv,
            cv_data=cv_data[:],
            verbose=verbose,
            num_filter=num_filter,
            num_labels=num_labels,
            word2vec_model_file_path=word2vec_model_file_path,
            embedding_weight_trainable=embedding_weight_trainable,
            get_cnn_middle_layer_output=get_cnn_middle_layer_output,
            need_validation=need_validation,
            rand_weight=rand_weight,
            batch_size=batch_size,
            lr=lr,
        )

        if get_cnn_middle_layer_output:
            # BUG FIX: pickle streams are binary — the original opened the
            # file in text mode ('w'), which breaks on Python 3 and can
            # corrupt the stream on Windows. Open in 'wb' instead.
            with open(middle_layer_output_file, 'wb') as fout:
                pickle.dump(cv_data, fout)
                pickle.dump(middle_output_dev, fout)
                pickle.dump(middle_output_val, fout)
class RFAndWordEmbeddingCnnMerge(CnnBaseClass):
    """Ensemble that feeds a static-w2v CNN's middle-layer output into a
    random forest classifier.

    ``static_w2v_cnn`` is trained first; its layer-4 activations are then
    used as features for ``bow_randomforest``.
    """

    __version__ = '1.4'
    # Class-level caches for the embedding weight matrices so the (expensive)
    # w2v model file is loaded at most once per dictionary.
    train_data_weight = None
    # separate weights for the validation dictionary (excludes the test set)
    val_data_weight = None

    def __init__(self, feature_encoder, num_filter, num_labels, n_estimators,
                 word2vec_model_file_path,
                 **kwargs):
        self.static_w2v_cnn = None
        self.bow_randomforest = None
        self.feature_encoder = feature_encoder

        if not kwargs.get('init_model', True):
            # Skip model construction — used when restoring from pickle.
            return

        if kwargs.get('rand_weight', False):
            # CNN(rand) mode: leave the embedding layer randomly initialised
            weight = None
        elif kwargs['dataset_flag'] == 0:
            # training split: load the embedding matrix once and cache it
            if RFAndWordEmbeddingCnnMerge.train_data_weight is None:
                RFAndWordEmbeddingCnnMerge.train_data_weight = feature_encoder.to_embedding_weight(
                    word2vec_model_file_path)
            weight = RFAndWordEmbeddingCnnMerge.train_data_weight
        else:
            # kwargs['dataset_flag']>0 : validation split keeps its own cache
            if RFAndWordEmbeddingCnnMerge.val_data_weight is None:
                RFAndWordEmbeddingCnnMerge.val_data_weight = feature_encoder.to_embedding_weight(
                    word2vec_model_file_path)
            weight = RFAndWordEmbeddingCnnMerge.val_data_weight
        # print(weight)
        self.static_w2v_cnn = WordEmbeddingCNN(
            rand_seed=1377,
            verbose=kwargs.get('verbose', 0),
            feature_encoder=feature_encoder,
            # optimizers='adadelta',
            optimizers='sgd',
            # only used in CNN(rand) mode
            word_embedding_dim=50,
            # initialise the embedding layer from the trained w2v model
            embedding_init_weight=weight,
            # by default the embedding weights stay frozen during training
            embedding_weight_trainable=kwargs.get('embedding_weight_trainable', False),
            num_labels=num_labels,
            l1_conv_filter_type=[
                [num_filter, 3, -1, 'valid', (-1, 1), 0.5, 'relu', 'none'],
                [num_filter, 4, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
                [num_filter, 5, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
            ],
            l2_conv_filter_type=[],
            full_connected_layer_units=[],
            embedding_dropout_rate=0.,
            nb_epoch=kwargs.get('nb_epoch', 25),
            batch_size=kwargs.get('batch_size', 32),
            earlyStoping_patience=30,
            lr=kwargs.get('lr', 1e-2),
            show_validate_accuracy=True if kwargs.get('verbose', 0) > 0 else False,
            # output_regularizer=('l2', 0.5),
            output_constraints=('maxnorm', 3),
            # must be True so the middle-layer output can be used as features
            save_middle_output=True,
        )

        self.bow_randomforest = BowRandomForest(
            rand_seed=1377,
            verbose=kwargs.get('verbose', 0),
            feature_encoder=feature_encoder,
            # optimizers='adadelta',
            n_estimators=n_estimators,
            min_samples_leaf=1,
        )

    def fit(self, train_data=None, validation_data=None):
        """Train the CNN, then train the random forest on the CNN's layer-4
        features; return the random forest's ``fit`` result."""
        train_X, train_y = train_data
        validation_X, validation_y = validation_data
        self.static_w2v_cnn.fit(train_data, validation_data)
        train_x_features = self.static_w2v_cnn.get_layer_output(train_X)[4]
        validation_x_features = self.static_w2v_cnn.get_layer_output(validation_X)[4]
        return self.bow_randomforest.fit((train_x_features, train_y),
                                         (validation_x_features, validation_y))

    def save_model(self, path):
        """Pickle the feature encoder and both sub-models to *path*.

        BUG FIX: the file is now opened with ``with`` so the handle is
        flushed and closed (the original never closed it, risking a
        truncated pickle).
        """
        with open(path, 'wb') as model_file:
            pickle.dump(self.feature_encoder, model_file)
            pickle.dump(self.static_w2v_cnn, model_file)
            pickle.dump(self.bow_randomforest, model_file)

    def model_from_pickle(self, path):
        """Restore a model previously written by :meth:`save_model`.

        BUG FIX: use ``open`` — the ``file`` builtin does not exist on
        Python 3 — and close the handle deterministically.
        """
        with open(path, 'rb') as model_file:
            self.feature_encoder = pickle.load(model_file)
            self.static_w2v_cnn = pickle.load(model_file)
            self.bow_randomforest = pickle.load(model_file)

    @staticmethod
    def get_feature_encoder(**kwargs):
        """Build the feature encoder for this classifier.

        Supported kwargs: ``input_length`` (required), ``full_mode``
        (default False), ``feature_type`` (default 'word'), ``verbose``
        (default 0), ``need_segmented``, ``vocabulary_including_test_set``,
        ``update_dictionary``.
        """
        # ``dict.has_key`` was removed in Python 3 — ``in`` works everywhere.
        assert 'input_length' in kwargs, '请提供 input_length 的属性值'

        from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder

        feature_encoder = FeatureEncoder(
            need_segmented=kwargs.get('need_segmented', True),
            sentence_padding_length=kwargs['input_length'],
            verbose=kwargs.get('verbose', 0),
            full_mode=kwargs.get('full_mode', False),
            remove_stopword=True,
            replace_number=True,
            lowercase=True,
            zhs2zht=True,
            remove_url=True,
            padding_mode='center',
            add_unkown_word=True,
            feature_type=kwargs.get('feature_type', 'word'),
            vocabulary_including_test_set=kwargs.get('vocabulary_including_test_set', True),
            update_dictionary=kwargs.get('update_dictionary', True),
        )
        return feature_encoder

    def batch_predict_bestn(self, sentences, transform_input=False, bestn=1):
        """Predict classes for *sentences* via CNN features + random forest.

        :param sentences: test sentences (index matrix, or raw strings when
            ``transform_input`` is True)
        :type sentences: array-like
        :param transform_input: transform raw sentences to index form first
        :type transform_input: bool
        :param bestn: number of top predictions to return
        :type bestn: int
        :return: y_pred_result, y_pred_score
        """
        if transform_input:
            sentences = self.static_w2v_cnn.transform(sentences)
        # sentences = np.asarray(sentences)
        # assert len(sentences.shape) == 2, 'shape必须是2维的!'
        train_x_features = self.static_w2v_cnn.get_layer_output(sentences)[4]
        # print(train_x_features)
        # print(train_x_features.shape)
        return self.bow_randomforest.batch_predict_bestn(train_x_features,
                                                         transform_input=False,
                                                         bestn=bestn)
def __init__(self, feature_encoder, num_filter, num_labels, n_estimators,
             word2vec_model_file_path,
             **kwargs
             ):
    # Build the two sub-models: a w2v-initialised CNN used as a feature
    # extractor and a random forest trained on the CNN's middle output.
    self.static_w2v_cnn = None
    self.bow_randomforest = None
    self.feature_encoder = feature_encoder

    if not kwargs.get('init_model', True):
        # Skip model construction — used when restoring from pickle.
        return

    if kwargs.get('rand_weight', False):
        # CNN(rand) mode: leave the embedding layer randomly initialised
        weight = None
    elif kwargs['dataset_flag'] == 0:
        # training split: cache the embedding matrix on the class so the
        # w2v model file is loaded only once
        if RFAndWordEmbeddingCnnMerge.train_data_weight is None:
            RFAndWordEmbeddingCnnMerge.train_data_weight = feature_encoder.to_embedding_weight(
                word2vec_model_file_path)
        weight = RFAndWordEmbeddingCnnMerge.train_data_weight
    else:
        # kwargs['dataset_flag']>0 : validation split keeps its own cached copy
        if RFAndWordEmbeddingCnnMerge.val_data_weight is None:
            RFAndWordEmbeddingCnnMerge.val_data_weight = feature_encoder.to_embedding_weight(
                word2vec_model_file_path)
        weight = RFAndWordEmbeddingCnnMerge.val_data_weight
    # print(weight)
    self.static_w2v_cnn = WordEmbeddingCNN(
        rand_seed=1377,
        verbose=kwargs.get('verbose', 0),
        feature_encoder=feature_encoder,
        # optimizers='adadelta',
        optimizers='sgd',
        # only used in CNN(rand) mode
        word_embedding_dim=50,
        # initialise the embedding layer from the trained w2v model
        embedding_init_weight=weight,
        # by default the embedding weights stay frozen during training
        embedding_weight_trainable=kwargs.get('embedding_weight_trainable', False),
        num_labels=num_labels,
        l1_conv_filter_type=[
            [num_filter, 3, -1, 'valid', (-1, 1), 0.5, 'relu', 'none'],
            [num_filter, 4, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
            [num_filter, 5, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
        ],
        l2_conv_filter_type=[],
        full_connected_layer_units=[],
        embedding_dropout_rate=0.,
        nb_epoch=kwargs.get('nb_epoch', 25),
        batch_size=kwargs.get('batch_size', 32),
        earlyStoping_patience=30,
        lr=kwargs.get('lr', 1e-2),
        show_validate_accuracy=True if kwargs.get('verbose', 0) > 0 else False,
        # output_regularizer=('l2', 0.5),
        output_constraints=('maxnorm', 3),
        # must be True so the middle-layer output can be used as features
        save_middle_output=True,
    )

    self.bow_randomforest = BowRandomForest(
        rand_seed=1377,
        verbose=kwargs.get('verbose', 0),
        feature_encoder=feature_encoder,
        # optimizers='adadelta',
        n_estimators=n_estimators,
        min_samples_leaf=1,
    )
class RFAndWordEmbeddingCnnMerge(CnnBaseClass):
    """Ensemble that feeds a static-w2v CNN's middle-layer output into a
    random forest classifier.

    ``static_w2v_cnn`` is trained first; its layer-4 activations are then
    used as features for ``bow_randomforest``.
    """

    __version__ = '1.4'
    # Class-level caches for the embedding weight matrices so the (expensive)
    # w2v model file is loaded at most once per dictionary.
    train_data_weight = None
    # separate weights for the validation dictionary (excludes the test set)
    val_data_weight = None

    def __init__(self, feature_encoder, num_filter, num_labels, n_estimators,
                 word2vec_model_file_path,
                 **kwargs
                 ):
        self.static_w2v_cnn = None
        self.bow_randomforest = None
        self.feature_encoder = feature_encoder

        if not kwargs.get('init_model', True):
            # Skip model construction — used when restoring from pickle.
            return

        if kwargs.get('rand_weight', False):
            # CNN(rand) mode: leave the embedding layer randomly initialised
            weight = None
        elif kwargs['dataset_flag'] == 0:
            # training split: load the embedding matrix once and cache it
            if RFAndWordEmbeddingCnnMerge.train_data_weight is None:
                RFAndWordEmbeddingCnnMerge.train_data_weight = feature_encoder.to_embedding_weight(
                    word2vec_model_file_path)
            weight = RFAndWordEmbeddingCnnMerge.train_data_weight
        else:
            # kwargs['dataset_flag']>0 : validation split keeps its own cache
            if RFAndWordEmbeddingCnnMerge.val_data_weight is None:
                RFAndWordEmbeddingCnnMerge.val_data_weight = feature_encoder.to_embedding_weight(
                    word2vec_model_file_path)
            weight = RFAndWordEmbeddingCnnMerge.val_data_weight
        # print(weight)
        self.static_w2v_cnn = WordEmbeddingCNN(
            rand_seed=1377,
            verbose=kwargs.get('verbose', 0),
            feature_encoder=feature_encoder,
            # optimizers='adadelta',
            optimizers='sgd',
            # only used in CNN(rand) mode
            word_embedding_dim=50,
            # initialise the embedding layer from the trained w2v model
            embedding_init_weight=weight,
            # by default the embedding weights stay frozen during training
            embedding_weight_trainable=kwargs.get('embedding_weight_trainable', False),
            num_labels=num_labels,
            l1_conv_filter_type=[
                [num_filter, 3, -1, 'valid', (-1, 1), 0.5, 'relu', 'none'],
                [num_filter, 4, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
                [num_filter, 5, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
            ],
            l2_conv_filter_type=[],
            full_connected_layer_units=[],
            embedding_dropout_rate=0.,
            nb_epoch=kwargs.get('nb_epoch', 25),
            batch_size=kwargs.get('batch_size', 32),
            earlyStoping_patience=30,
            lr=kwargs.get('lr', 1e-2),
            show_validate_accuracy=True if kwargs.get('verbose', 0) > 0 else False,
            # output_regularizer=('l2', 0.5),
            output_constraints=('maxnorm', 3),
            # must be True so the middle-layer output can be used as features
            save_middle_output=True,
        )

        self.bow_randomforest = BowRandomForest(
            rand_seed=1377,
            verbose=kwargs.get('verbose', 0),
            feature_encoder=feature_encoder,
            # optimizers='adadelta',
            n_estimators=n_estimators,
            min_samples_leaf=1,
        )

    def fit(self, train_data=None, validation_data=None):
        """Train the CNN, then train the random forest on the CNN's layer-4
        features; return the random forest's ``fit`` result."""
        train_X, train_y = train_data
        validation_X, validation_y = validation_data
        self.static_w2v_cnn.fit(train_data, validation_data)
        train_x_features = self.static_w2v_cnn.get_layer_output(train_X)[4]
        validation_x_features = self.static_w2v_cnn.get_layer_output(validation_X)[4]
        return self.bow_randomforest.fit((train_x_features, train_y),
                                         (validation_x_features, validation_y))

    def save_model(self, path):
        """Pickle the feature encoder and both sub-models to *path*.

        BUG FIX: the file is now opened with ``with`` so the handle is
        flushed and closed (the original never closed it, risking a
        truncated pickle).
        """
        with open(path, 'wb') as model_file:
            pickle.dump(self.feature_encoder, model_file)
            pickle.dump(self.static_w2v_cnn, model_file)
            pickle.dump(self.bow_randomforest, model_file)

    def model_from_pickle(self, path):
        """Restore a model previously written by :meth:`save_model`.

        BUG FIX: use ``open`` — the ``file`` builtin does not exist on
        Python 3 — and close the handle deterministically.
        """
        with open(path, 'rb') as model_file:
            self.feature_encoder = pickle.load(model_file)
            self.static_w2v_cnn = pickle.load(model_file)
            self.bow_randomforest = pickle.load(model_file)

    @staticmethod
    def get_feature_encoder(**kwargs):
        """Build the feature encoder for this classifier.

        Supported kwargs: ``input_length`` (required), ``full_mode``
        (default False), ``feature_type`` (default 'word'), ``verbose``
        (default 0), ``need_segmented``, ``vocabulary_including_test_set``,
        ``update_dictionary``.
        """
        # ``dict.has_key`` was removed in Python 3 — ``in`` works everywhere.
        assert 'input_length' in kwargs, '请提供 input_length 的属性值'

        from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder

        feature_encoder = FeatureEncoder(
            need_segmented=kwargs.get('need_segmented', True),
            sentence_padding_length=kwargs['input_length'],
            verbose=kwargs.get('verbose', 0),
            full_mode=kwargs.get('full_mode', False),
            remove_stopword=True,
            replace_number=True,
            lowercase=True,
            zhs2zht=True,
            remove_url=True,
            padding_mode='center',
            add_unkown_word=True,
            feature_type=kwargs.get('feature_type', 'word'),
            vocabulary_including_test_set=kwargs.get('vocabulary_including_test_set', True),
            update_dictionary=kwargs.get('update_dictionary', True),
        )
        return feature_encoder

    def batch_predict_bestn(self, sentences, transform_input=False, bestn=1):
        """Predict classes for *sentences* via CNN features + random forest.

        :param sentences: test sentences (index matrix, or raw strings when
            ``transform_input`` is True)
        :type sentences: array-like
        :param transform_input: transform raw sentences to index form first
        :type transform_input: bool
        :param bestn: number of top predictions to return
        :type bestn: int
        :return: y_pred_result, y_pred_score
        """
        if transform_input:
            sentences = self.static_w2v_cnn.transform(sentences)
        # sentences = np.asarray(sentences)
        # assert len(sentences.shape) == 2, 'shape必须是2维的!'
        train_x_features = self.static_w2v_cnn.get_layer_output(sentences)[4]
        # print(train_x_features)
        # print(train_x_features.shape)
        return self.bow_randomforest.batch_predict_bestn(train_x_features,
                                                         transform_input=False,
                                                         bestn=bestn)
# Build 3-fold CV data and run cross validation for the static-w2v CNN,
# writing per-fold detail to a result file.
cv_data = data_util.get_k_fold_data(k=3,
                                    data=train_data,
                                    rand_seed=0,
                                    )
WordEmbeddingCNN.cross_validation(
    cv_data,
    (test_data[u'SENTENCE'].as_matrix(), test_y),
    'result/static_W2V_%s_cv_detail.txt' % feature_type,
    rand_seed=rand_seed,
    nb_epoch=nb_epoch,
    verbose=verbose,
    feature_type=feature_type,
    full_mode=False,
    layer1=layer1,
    l1_conv_filter_type=l1_conv_filter_type,
    layer2=layer2,
    l2_conv_filter_type=l2_conv_filter_type,
    k=k,
    hidden1=hidden1,
    hidden2=hidden2,
    word_embedding_dim=word_embedding_dim,
    sentence_padding_length=sentence_padding_length,
    # resolve the w2v model trained on the 1M-weibo corpus for this dim
    word2vec_model_file_path=data_util.transform_word2vec_model_name('%dd_weibo_100w' % word_embedding_dim),
    embedding_weight_trainable=True,
)
end_time = timeit.default_timer()
print 'end! Running time:%ds!' % (end_time - start_time)
logging.debug('=' * 20)