# Test dataset: sentences (X) and label indices (y).
# `.values` is used instead of `.as_matrix()`: pandas deprecated
# `as_matrix()` in 0.23 and removed it in 1.0, while `.values` returns the
# same NumPy array on every pandas version.
# NOTE(review): assumes test_data is a pandas DataFrame -- confirm.
test_X = test_data['SENTENCE'].values
test_y = test_data['LABEL_INDEX'].values
# endregion -------------- load training and test data ---------------

# region -------------- cross validation -------------
# Banner printed only when verbose output is requested.
if config['verbose'] > 0:
    print('-' * 20)
    print('cross validation')

# Imported at the point of use, as in the original script.
from traditional_classify.bow_rf.bow_rf_model import BowRandomForest

# train_X / train_y are expected to be defined earlier in the script.
BowRandomForest.cross_validation(
    train_data=(train_X, train_y),
    test_data=(test_X, test_y),
    shuffle_data=True,
    n_estimators_list=estimator_paramter_list,
    feature_type=feature_type,
    word2vec_to_solve_oov=False,
    # word2vec_model_file_path=None,
    verbose=config['verbose'],
    cv=3,
    need_segmented=True,
    need_validation=True,
    include_train_data=True,
)

if config['verbose'] > 0:
    print('-' * 20)
# endregion -------------- cross validation ---------------
])  # closes a construct opened before this excerpt
# region -------------- cross validation -------------
# Banner printed only when verbose output is requested.
if config['verbose'] > 0:
    print('-' * 20)
    print('cross validation')
# Imported at the point of use rather than at the top of the script.
from traditional_classify.bow_rf.bow_rf_model import BowRandomForest
# Only cv_data carries data in this call; train_data and test_data are
# passed explicitly as None.
BowRandomForest.cross_validation(
    train_data=None,
    test_data=None,
    cv_data=cv_data,
    shuffle_data=True,
    n_estimators_list=estimator_paramter_list,
    # feature_type=feature_type,
    word2vec_to_solve_oov=False,
    # word2vec_model_file_path=None,
    verbose=config['verbose'],
    cv=3,
    # the input is fed in directly (no input transformation requested)
    need_transform_input=False,
    # need_segmented=False,
    need_validation=True,
    include_train_data=True,
)
if config['verbose'] > 0:
    print('-' * 20)
# endregion -------------- cross validation ---------------
def __init__(self, feature_encoder, num_filter, num_labels, n_estimators, word2vec_model_file_path, **kwargs):
    """Build the CNN and the random forest that is trained on its outputs.

    :param feature_encoder: encoder handed to both sub-models; its
        ``to_embedding_weight`` builds the initial embedding weight.
    :param num_filter: first entry of each of the three first-layer
        convolution specs.
    :param num_labels: forwarded to ``WordEmbeddingCNN`` as ``num_labels``.
    :param n_estimators: forwarded to ``BowRandomForest`` as ``n_estimators``.
    :param word2vec_model_file_path: argument passed to
        ``feature_encoder.to_embedding_weight``.
    :param kwargs: keys read here, defaults in parentheses:
        ``rand_weight`` (False), ``dataset_flag`` (no default; a missing key
        raises ``KeyError`` unless ``rand_weight`` is truthy), ``verbose`` (0),
        ``embedding_weight_trainable`` (False), ``nb_epoch`` (25),
        ``batch_size`` (32), ``lr`` (1e-2).
    """
    merge_cls = RFAndWordEmbeddingCnnMerge
    verbose = kwargs.get('verbose', 0)

    if kwargs.get('rand_weight', False):
        # CNN (rand) mode: no pre-built embedding weight is supplied.
        weight = None
    else:
        # dataset_flag == 0 uses the training-set cache, any other value the
        # validation cache.  Each cache lives on the class and is filled once.
        cache_name = 'train_data_weight' if kwargs['dataset_flag'] == 0 else 'val_data_weight'
        if getattr(merge_cls, cache_name) is None:
            setattr(merge_cls, cache_name,
                    feature_encoder.to_embedding_weight(word2vec_model_file_path))
        weight = getattr(merge_cls, cache_name)

    # Three first-layer specs that differ only in their 2nd and 6th entries.
    # NOTE(review): those entries look like filter height and dropout rate --
    # confirm against WordEmbeddingCNN.
    conv_specs = [
        [num_filter, size, -1, 'valid', (-1, 1), rate, 'relu', 'none']
        for size, rate in ((3, 0.5), (4, 0.), (5, 0.))
    ]

    cnn_options = dict(
        rand_seed=1377,
        verbose=verbose,
        feature_encoder=feature_encoder,
        optimizers='sgd',
        # only used in CNN (rand) mode
        word_embedding_dim=50,
        # initialise the embedding layer from the pre-trained w2v weight
        embedding_init_weight=weight,
        # by default the embedding weights stay fixed during training
        embedding_weight_trainable=kwargs.get('embedding_weight_trainable', False),
        num_labels=num_labels,
        l1_conv_filter_type=conv_specs,
        l2_conv_filter_type=[],
        full_connected_layer_units=[],
        embedding_dropout_rate=0.,
        nb_epoch=kwargs.get('nb_epoch', 25),
        batch_size=kwargs.get('batch_size', 32),
        earlyStoping_patience=30,
        lr=kwargs.get('lr', 1e-2),
        show_validate_accuracy=bool(verbose > 0),
        output_constraints=('maxnorm', 3),
        # must be True so intermediate outputs can be read back as features
        save_middle_output=True,
    )
    self.static_w2v_cnn = WordEmbeddingCNN(**cnn_options)

    self.bow_randomforest = BowRandomForest(
        rand_seed=1377,
        verbose=verbose,
        feature_encoder=feature_encoder,
        n_estimators=n_estimators,
        min_samples_leaf=1,
    )
}  # closes a literal opened before this excerpt
from version_2.data_processing.data_util import DataUtil
data_util = DataUtil()
# Load both data splits, then the two label lookups returned by
# get_label_index().
train_data, test_data = data_util.load_train_test_data(config)
label_to_index, index_to_label = data_util.get_label_index()
# Texts (x) and stance indices (y) of each split.
# NOTE(review): as_matrix() looks like the pandas accessor that was
# deprecated in 0.23 and removed in 1.0 (`.values` is the replacement) --
# confirm against the pandas version this script runs on.
train_x = train_data['TEXT'].as_matrix()
train_y = train_data['STANCE_INDEX'].as_matrix()
test_x = test_data['TEXT'].as_matrix()
test_y = test_data['STANCE_INDEX'].as_matrix()
# Imported at the point of use rather than at the top of the script.
from traditional_classify.bow_rf.bow_rf_model import BowRandomForest
BowRandomForest.cross_validation(
    train_data=(train_x, train_y),
    test_data=(test_x, test_y),
    shuffle_data = False,
    # n_estimators_list = [10,20,30,40,50,60,70,80,90,100,200,300,400,500,1000,2000,3000,4000,5000],
    # active candidate list: 10, 20, ..., 1000
    n_estimators_list = range(10,1010,10),
    # n_estimators_list = [610],
    verbose=0,
    feature_type = 'word',
    word2vec_to_solve_oov=False,
    # NOTE(review): absolute, machine-specific path.
    word2vec_model_file_path = '/home/jdwang/PycharmProjects/corprocessor/word2vec/vector/50dim/vector1000000_50dim.gem'
)
logging.debug('start running!')
logging.debug('=' * 20)

from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil

data_util = DataUtil()
# Load both data splits, then the two label lookups returned by
# get_label_index().
train_data, test_data = data_util.load_train_test_data(config)
label_to_index, index_to_label = data_util.get_label_index()

# Sentences (x) and label indices (y) of each split.
# `.values` is used instead of `.as_matrix()`: pandas deprecated
# `as_matrix()` in 0.23 and removed it in 1.0, while `.values` returns the
# same NumPy array on every pandas version.
# NOTE(review): assumes train_data / test_data are pandas DataFrames -- confirm.
train_x = train_data['SENTENCE'].values
train_y = train_data['LABEL_INDEX'].values
test_x = test_data['SENTENCE'].values
test_y = test_data['LABEL_INDEX'].values

from traditional_classify.bow_rf.bow_rf_model import BowRandomForest

BowRandomForest.cross_validation(
    train_data=(train_x, train_y),
    test_data=(test_x, test_y),
    shuffle_data=False,
    # n_estimators_list = [10,20,30,40,50,60,70,80,90,100,200,300,400,500,1000,2000,3000,4000,5000],
    n_estimators_list=[640, 470, 490],
    # n_estimators_list = [290],
    # n_estimators_list = range(10,1010,10),
    # n_estimators_list = [330],
    verbose=0,
    feature_type='word',
    word2vec_to_solve_oov=True,
    # The model path is produced by data_util from the short name below.
    word2vec_model_file_path=data_util.transform_word2vec_model_name('50d_weibo_100w'),
)
rand_seed=3,
)  # closes a call opened before this excerpt
# NOTE(review): this local `verbose` is not used in the lines visible here;
# the call below passes the literal verbose=0.
verbose =1
shuffle_data = True
import numpy as np
# A fresh random seed per run; the fixed alternative is kept commented out.
rand_seed = np.random.randint(0,1e5)
# rand_seed = 1000
# The first three arguments are positional: cv_data, a (sentences, labels)
# pair for the test set, and the string 'result/rf_bow_cv_detail.txt'.
# NOTE(review): as_matrix() looks like the pandas accessor that was
# deprecated in 0.23 and removed in 1.0 (`.values` is the replacement) --
# confirm against the pandas version this script runs on.
BowRandomForest.cross_validation(
    cv_data,
    (test_data[u'SENTENCE'].as_matrix(), test_y),
    'result/rf_bow_cv_detail.txt',
    verbose=0,
    rand_seed = rand_seed,
    shuffle_data = shuffle_data,
    feature_type='seg',
    n_estimators = [10,20,30,40,50,60,70,80,90,100,200,300,400,500,1000,2000,3000,4000,5000],
    # n_estimators = [2000],
    remove_stopword = True,
    word2vec_to_solve_oov = False,
    word2vec_model_file_path = config['word2vec_model_file_path'],
)
# Report the elapsed time on stdout and in the debug log; start_time is
# expected to be set earlier in the script.
end_time = timeit.default_timer()
# Python 2 print statement.
print 'end! Running time:%ds!' % (end_time - start_time)
logging.debug('=' * 20)
logging.debug('end! Running time:%ds!' % (end_time - start_time))
dev_y.extend(list(train_y_10fold[i])) print('train data size:%d'%len(dev_y)) x = dev_x y = dev_y if shuffle_data: # 打乱数据 x = np.random.RandomState(seed).permutation(x) y = np.random.RandomState(seed).permutation(y) # print(dev_y) dev_X_feature = feature_encoder.fit_transform(x) test_X_feature = feature_encoder.transform(test_data[u'SENTENCE'].as_matrix()) bow_rf = BowRandomForest( # rand_seed=rand_seed, verbose=0, n_estimators=estimators, min_samples_leaf=1, feature_encoder=None, ) bow_rf.fit(train_data=(dev_X_feature, y), validation_data=(test_X_feature, test_y)) _, _, dev_accuracy, _ = bow_rf.accuracy((dev_X_feature, y), False) _, _, val_accuracy, _ = bow_rf.accuracy((test_X_feature, test_y), False) train_accu.append(dev_accuracy) test_accu.append(val_accuracy) print('-' * 80) print('#***#训练准确率:%s'%(train_accu))
test_X_feature = feature_encoder.transform(test_data[u'SENTENCE'].as_matrix()) print(train_X_feature) feature_encoder.print_model_descibe() keywords = feature_encoder.vocabulary print ','.join(keywords) print len(keywords) else: feature_encoder = None # quit() # for seed in [1,10,100,500,1000,2000,10000]: seed = 0 for n_estimators in [10,100,200,300,400,500,800,1000,2000,5000]: bow_rf = BowRandomForest( rand_seed=seed, verbose=config['verbose'], n_estimators=n_estimators, min_samples_leaf=1, feature_encoder=feature_encoder, ) model_file_path = ''.join([str(item) for item in config['model_file_path']]) result_file_path = ''.join([str(item) for item in config['result_file_path']]) result_file_path = result_file_path%seed print model_file_path print result_file_path # quit() if config['refresh_all_model']: bow_rf.fit(train_data=(train_X_feature, train_data['LABEL_INDEX']), validation_data=(test_X_feature, test_data['LABEL_INDEX'])) bow_rf.save_model(model_file_path) else:
# `.values` is used instead of `.as_matrix()`: pandas deprecated
# `as_matrix()` in 0.23 and removed it in 1.0, while `.values` returns the
# same NumPy array on every pandas version.
# NOTE(review): assumes train_data / test_data are pandas DataFrames -- confirm.
train_y = train_data['LABEL_INDEX'].values
# test dataset X-y
test_X = test_data['SENTENCE'].values
test_y = test_data['LABEL_INDEX'].values
print('=' * 30 + '数据加载完毕' + '=' * 30)
# endregion

# region -------------- 3 cross validation -------------
# Banner printed only when verbose output is requested.
if config['verbose'] > 0:
    print('-' * 20)
    print('cross validation')

# train_X is expected to be defined earlier in the script.
BowRandomForest.cross_validation(
    train_data=(train_X, train_y),
    test_data=(test_X, test_y),
    shuffle_data=True,
    n_estimators_list=estimator_paramter_list,
    feature_type=feature_type,
    word2vec_to_solve_oov=False,
    # word2vec_model_file_path=None,
    verbose=config['verbose'],
    cv=3,
    need_segmented=True,
    need_validation=True,
    include_train_data=True,
)

if config['verbose'] > 0:
    print('-' * 20)
# endregion
# region -------------- cross validation -------------
# Banner printed only when verbose output is requested.
if config['verbose'] > 0:
    for banner_line in ('-' * 20, 'cross validation'):
        print(banner_line)

from traditional_classify.bow_rf.bow_rf_model import BowRandomForest

# Keyword arguments are collected first and then passed in one go.
# Only cv_data carries data; train_data and test_data stay None.
cv_options = dict(
    train_data=None,
    test_data=None,
    cv_data=cv_data,
    shuffle_data=True,
    n_estimators_list=estimator_paramter_list,
    word2vec_to_solve_oov=False,
    verbose=config['verbose'],
    cv=3,
    # the input is fed in directly (no input transformation requested)
    need_transform_input=False,
    need_validation=True,
    include_train_data=True,
)
BowRandomForest.cross_validation(**cv_options)

if config['verbose'] > 0:
    print('-' * 20)
# endregion -------------- cross validation ---------------
# ------------------------------------------------------------------------------
# Progress banners written to the debug log, in their original order.
heavy_rule = '=' * 20
for log_line in (
        heavy_rule,
        '将数据转为特征',
        heavy_rule,
        '对数据进行分词...',
        '-' * 20,
        heavy_rule,
        '开始生成特征向量...',
):
    logging.debug(log_line)

# Encoder options come from the script configuration and from variables
# set earlier in the script.
encoder_options = dict(
    verbose=config['verbose'],
    feature_type=feature_type,
    word2vec_to_solve_oov=word2vec_to_solve_oov,
    word2vec_model_file_path=config['word2vec_model_file_path'],
)
feature_encoder = BowRandomForest.get_feature_encoder(**encoder_options)

# Split the training data with a fixed seed (10); k comes from k_of_fold.
train_X_10fold, train_y_10fold = data_util.get_k_fold_data(
    k=k_of_fold,
    data=train_data,
    rand_seed=10,
)
config = {
    'verbose': 1,
}

from version_2.data_processing.data_util import DataUtil

data_util = DataUtil()
# Load both data splits, then the two label lookups returned by
# get_label_index().
train_data, test_data = data_util.load_train_test_data(config)
label_to_index, index_to_label = data_util.get_label_index()

# Texts (x) and stance indices (y) of each split.
# `.values` is used instead of `.as_matrix()`: pandas deprecated
# `as_matrix()` in 0.23 and removed it in 1.0, while `.values` returns the
# same NumPy array on every pandas version.
# NOTE(review): assumes train_data / test_data are pandas DataFrames -- confirm.
train_x = train_data['TEXT'].values
train_y = train_data['STANCE_INDEX'].values
test_x = test_data['TEXT'].values
test_y = test_data['STANCE_INDEX'].values

from traditional_classify.bow_rf.bow_rf_model import BowRandomForest

BowRandomForest.cross_validation(
    train_data=(train_x, train_y),
    test_data=(test_x, test_y),
    shuffle_data=False,
    # n_estimators_list = [10,20,30,40,50,60,70,80,90,100,200,300,400,500,1000,2000,3000,4000,5000],
    # active candidate list: 10, 20, ..., 1000
    n_estimators_list=range(10, 1010, 10),
    # n_estimators_list = [610],
    verbose=0,
    feature_type='word',
    word2vec_to_solve_oov=False,
    # NOTE(review): absolute, machine-specific path.
    word2vec_model_file_path=
    '/home/jdwang/PycharmProjects/corprocessor/word2vec/vector/50dim/vector1000000_50dim.gem'
)
class RFAndWordEmbeddingCnnMerge(CnnBaseClass):
    """Two-stage classifier: a word-embedding CNN followed by a random forest.

    ``fit`` trains a ``WordEmbeddingCNN``, reads one of its layer outputs for
    every sample and trains a ``BowRandomForest`` on those outputs.
    ``batch_predict_bestn`` follows the same two steps at prediction time.
    """

    __version__ = '1.4'
    # When the whole data set is used as the dictionary, the embedding weight
    # is stored here so it is not rebuilt for every instance (the weight is
    # the same on every load).  Used when ``dataset_flag == 0``.
    train_data_weight = None
    # Separate cached weight for the validation data (which no longer
    # includes the test set).  Used when ``dataset_flag != 0``.
    val_data_weight = None

    def __init__(self, feature_encoder, num_filter, num_labels, n_estimators,
                 word2vec_model_file_path, **kwargs):
        """Build the CNN and the random forest that is trained on its outputs.

        :param feature_encoder: encoder stored on the instance and handed to
            both sub-models; its ``to_embedding_weight`` builds the initial
            embedding weight.
        :param num_filter: first entry of each of the three first-layer
            convolution specs.
        :param num_labels: forwarded to ``WordEmbeddingCNN``.
        :param n_estimators: forwarded to ``BowRandomForest``.
        :param word2vec_model_file_path: argument passed to
            ``feature_encoder.to_embedding_weight``.
        :param kwargs: keys read here, defaults in parentheses:
            ``init_model`` (True), ``rand_weight`` (False), ``dataset_flag``
            (no default; a missing key raises ``KeyError`` unless
            ``rand_weight`` is truthy), ``verbose`` (0),
            ``embedding_weight_trainable`` (False), ``nb_epoch`` (25),
            ``batch_size`` (32), ``lr`` (1e-2).
        """
        self.static_w2v_cnn = None
        self.bow_randomforest = None
        self.feature_encoder = feature_encoder
        if not kwargs.get('init_model', True):
            # Skip building the models; typically used before restoring a
            # saved model with model_from_pickle().
            return

        if kwargs.get('rand_weight', False):
            # CNN (rand) mode: no pre-built embedding weight is supplied.
            weight = None
        elif kwargs['dataset_flag'] == 0:
            # Training set: build the weight once, then reuse the class cache.
            if RFAndWordEmbeddingCnnMerge.train_data_weight is None:
                RFAndWordEmbeddingCnnMerge.train_data_weight = feature_encoder.to_embedding_weight(
                    word2vec_model_file_path)
            weight = RFAndWordEmbeddingCnnMerge.train_data_weight
        else:
            # kwargs['dataset_flag'] > 0: validation data, cached separately.
            if RFAndWordEmbeddingCnnMerge.val_data_weight is None:
                RFAndWordEmbeddingCnnMerge.val_data_weight = feature_encoder.to_embedding_weight(
                    word2vec_model_file_path)
            weight = RFAndWordEmbeddingCnnMerge.val_data_weight
        # print(weight)
        self.static_w2v_cnn = WordEmbeddingCNN(
            rand_seed=1377,
            verbose=kwargs.get('verbose', 0),
            feature_encoder=feature_encoder,
            # optimizers='adadelta',
            optimizers='sgd',
            # only used in CNN (rand) mode
            word_embedding_dim=50,
            # initialise the embedding layer from the pre-trained w2v weight
            embedding_init_weight=weight,
            # by default the embedding weights stay fixed during training
            embedding_weight_trainable=kwargs.get('embedding_weight_trainable', False),
            num_labels=num_labels,
            l1_conv_filter_type=[
                [num_filter, 3, -1, 'valid', (-1, 1), 0.5, 'relu', 'none'],
                [num_filter, 4, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
                [num_filter, 5, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
            ],
            l2_conv_filter_type=[],
            full_connected_layer_units=[],
            embedding_dropout_rate=0.,
            nb_epoch=kwargs.get('nb_epoch', 25),
            batch_size=kwargs.get('batch_size', 32),
            earlyStoping_patience=30,
            lr=kwargs.get('lr', 1e-2),
            show_validate_accuracy=True if kwargs.get('verbose', 0) > 0 else False,
            # output_regularizer=('l2', 0.5),
            output_constraints=('maxnorm', 3),
            # must be True so intermediate outputs can be read back as features
            save_middle_output=True,
        )

        self.bow_randomforest = BowRandomForest(
            rand_seed=1377,
            verbose=kwargs.get('verbose', 0),
            feature_encoder=feature_encoder,
            n_estimators=n_estimators,
            min_samples_leaf=1,
        )

    def fit(self, train_data=None, validation_data=None):
        """Train the CNN, then train the random forest on the CNN's outputs.

        :param train_data: ``(train_X, train_y)`` pair.  Required in practice:
            it is unpacked unconditionally, so the ``None`` default fails.
        :param validation_data: ``(validation_X, validation_y)`` pair; same
            remark as for ``train_data``.
        :return: whatever ``BowRandomForest.fit`` returns.
        """
        train_X, train_y = train_data
        validation_X, validation_y = validation_data

        self.static_w2v_cnn.fit(train_data, validation_data)

        # Element 4 of the layer outputs is used as the feature matrix.
        # NOTE(review): which layer index 4 refers to is decided by
        # WordEmbeddingCNN.get_layer_output -- confirm there.
        train_x_features = self.static_w2v_cnn.get_layer_output(train_X)[4]
        validation_x_features = self.static_w2v_cnn.get_layer_output(validation_X)[4]

        return self.bow_randomforest.fit((train_x_features, train_y),
                                         (validation_x_features, validation_y))

    def save_model(self, path):
        """Pickle the feature encoder, the CNN and the forest to ``path``.

        The three objects are written one after another to a single file, in
        the order that ``model_from_pickle`` reads them back.

        :param path: path of the model file to write.
        :type path: str
        :return: None
        """
        # The context manager closes (and so flushes) the file even when one
        # of the dumps raises; the handle used to be left open.
        with open(path, 'wb') as model_file:
            pickle.dump(self.feature_encoder, model_file)
            pickle.dump(self.static_w2v_cnn, model_file)
            pickle.dump(self.bow_randomforest, model_file)

    def model_from_pickle(self, path):
        """Restore the three sub-objects written by ``save_model``.

        The loaded objects are assigned to this instance; nothing is returned.

        :param path: path of the model file to read.
        :type path: str
        :return: None
        """
        # SECURITY: unpickling executes code embedded in the file -- only load
        # model files from a trusted source.
        # ``open`` replaces the Python-2-only ``file`` builtin, and the
        # context manager closes the handle, which used to leak.
        with open(path, 'rb') as model_file:
            self.feature_encoder = pickle.load(model_file)
            self.static_w2v_cnn = pickle.load(model_file)
            self.bow_randomforest = pickle.load(model_file)

    @staticmethod
    def get_feature_encoder(**kwargs):
        """Create the feature encoder used by this classifier.

        :param kwargs: ``input_length`` is required.  Optional keys, defaults
            in parentheses: ``need_segmented`` (True), ``verbose`` (0),
            ``full_mode`` (False), ``feature_type`` ('word'),
            ``vocabulary_including_test_set`` (True),
            ``update_dictionary`` (True).
        :return: a ``FeatureEncoder`` instance.
        """
        # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
        assert 'input_length' in kwargs, '请提供 input_length 的属性值'
        from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder

        feature_encoder = FeatureEncoder(
            need_segmented=kwargs.get('need_segmented', True),
            sentence_padding_length=kwargs['input_length'],
            verbose=kwargs.get('verbose', 0),
            full_mode=kwargs.get('full_mode', False),
            remove_stopword=True,
            replace_number=True,
            lowercase=True,
            zhs2zht=True,
            remove_url=True,
            padding_mode='center',
            add_unkown_word=True,
            feature_type=kwargs.get('feature_type', 'word'),
            vocabulary_including_test_set=kwargs.get('vocabulary_including_test_set', True),
            update_dictionary=kwargs.get('update_dictionary', True),
        )

        return feature_encoder

    def batch_predict_bestn(self, sentences, transform_input=False, bestn=1):
        """Predict the ``bestn`` best classes for a batch of sentences.

        :param sentences: sentences to classify.
        :type sentences: array-like
        :param transform_input: when True, ``sentences`` is first passed
            through ``self.static_w2v_cnn.transform`` (so raw sentences can be
            supplied); when False it is used as given.
        :type transform_input: bool
        :param bestn: number of best results to return per sentence.
        :type bestn: int
        :return: the result of ``BowRandomForest.batch_predict_bestn``
            (documented upstream as ``y_pred_result, y_pred_score``).
        """
        if transform_input:
            sentences = self.static_w2v_cnn.transform(sentences)

        # Same layer output (element 4) that fit() used as training features.
        sentence_features = self.static_w2v_cnn.get_layer_output(sentences)[4]

        # The forest receives ready-made features, hence transform_input=False.
        return self.bow_randomforest.batch_predict_bestn(sentence_features,
                                                         transform_input=False,
                                                         bestn=bestn)
print(train_X_feature) logging.debug('=' * 20) # ------------------------------------------------------------------------------ # -------------- region end : 将数据转为特征 --------------- # ------------------------------------------------------------------------------ # for seed in [1,10,100,500,1000,2000,10000]: for n_estimators in estimators: print('='*20) print('随机森林棵树:%d'%(n_estimators)) bow_rf = BowRandomForest( # rand_seed=seed, verbose=config['verbose'], n_estimators=n_estimators, min_samples_leaf=1, feature_encoder=None, ) model_file_path = ''.join([str(item) for item in config['model_file_path']]) result_file_path = 'rf_bow_%s_%dtree.csv'%(feature_type,n_estimators) print model_file_path print result_file_path # quit() if config['refresh_all_model']: bow_rf.fit(train_data=(train_X_feature, train_y), validation_data=(test_X_feature,test_y)) bow_rf.save_model(model_file_path) else:
def __init__(self, feature_encoder, num_filter, num_labels, n_estimators, word2vec_model_file_path, **kwargs):
    """Build the CNN and the random forest that is trained on its outputs.

    :param feature_encoder: encoder stored on the instance and handed to both
        sub-models; its ``to_embedding_weight`` builds the initial embedding
        weight.
    :param num_filter: first entry of each of the three first-layer
        convolution specs.
    :param num_labels: forwarded to ``WordEmbeddingCNN`` as ``num_labels``.
    :param n_estimators: forwarded to ``BowRandomForest`` as ``n_estimators``.
    :param word2vec_model_file_path: argument passed to
        ``feature_encoder.to_embedding_weight``.
    :param kwargs: keys read here, defaults in parentheses:
        ``init_model`` (True), ``rand_weight`` (False), ``dataset_flag``
        (no default; a missing key raises ``KeyError`` unless ``rand_weight``
        is truthy), ``verbose`` (0), ``embedding_weight_trainable`` (False),
        ``nb_epoch`` (25), ``batch_size`` (32), ``lr`` (1e-2).
    """
    self.feature_encoder = feature_encoder
    self.static_w2v_cnn = self.bow_randomforest = None

    if not kwargs.get('init_model', True):
        # Leave both models unset; typically used before restoring a saved
        # model.
        return

    merge_cls = RFAndWordEmbeddingCnnMerge
    verbose = kwargs.get('verbose', 0)

    if kwargs.get('rand_weight', False):
        # CNN (rand) mode: no pre-built embedding weight is supplied.
        weight = None
    else:
        # dataset_flag == 0 uses the training-set cache, any other value the
        # validation cache.  Each cache lives on the class and is filled once.
        cache_name = 'train_data_weight' if kwargs['dataset_flag'] == 0 else 'val_data_weight'
        if getattr(merge_cls, cache_name) is None:
            setattr(merge_cls, cache_name,
                    feature_encoder.to_embedding_weight(word2vec_model_file_path))
        weight = getattr(merge_cls, cache_name)

    # Three first-layer specs that differ only in their 2nd and 6th entries.
    # NOTE(review): those entries look like filter height and dropout rate --
    # confirm against WordEmbeddingCNN.
    conv_specs = [
        [num_filter, size, -1, 'valid', (-1, 1), rate, 'relu', 'none']
        for size, rate in ((3, 0.5), (4, 0.), (5, 0.))
    ]

    cnn_options = dict(
        rand_seed=1377,
        verbose=verbose,
        feature_encoder=feature_encoder,
        optimizers='sgd',
        # only used in CNN (rand) mode
        word_embedding_dim=50,
        # initialise the embedding layer from the pre-trained w2v weight
        embedding_init_weight=weight,
        # by default the embedding weights stay fixed during training
        embedding_weight_trainable=kwargs.get('embedding_weight_trainable', False),
        num_labels=num_labels,
        l1_conv_filter_type=conv_specs,
        l2_conv_filter_type=[],
        full_connected_layer_units=[],
        embedding_dropout_rate=0.,
        nb_epoch=kwargs.get('nb_epoch', 25),
        batch_size=kwargs.get('batch_size', 32),
        earlyStoping_patience=30,
        lr=kwargs.get('lr', 1e-2),
        show_validate_accuracy=bool(verbose > 0),
        output_constraints=('maxnorm', 3),
        # must be True so intermediate outputs can be read back as features
        save_middle_output=True,
    )
    self.static_w2v_cnn = WordEmbeddingCNN(**cnn_options)

    self.bow_randomforest = BowRandomForest(
        rand_seed=1377,
        verbose=verbose,
        feature_encoder=feature_encoder,
        n_estimators=n_estimators,
        min_samples_leaf=1,
    )
class RFAndWordEmbeddingCnnMerge(CnnBaseClass):
    """Two-stage classifier: a word-embedding CNN followed by a random forest.

    ``fit`` trains a ``WordEmbeddingCNN``, reads one of its layer outputs for
    every sample and trains a ``BowRandomForest`` on those outputs.
    ``batch_predict_bestn`` follows the same two steps at prediction time.
    """

    __version__ = '1.4'
    # When the whole data set is used as the dictionary, the embedding weight
    # is stored here so it is not rebuilt for every instance (the weight is
    # the same on every load).  Used when ``dataset_flag == 0``.
    train_data_weight = None
    # Separate cached weight for the validation data (which no longer
    # includes the test set).  Used when ``dataset_flag != 0``.
    val_data_weight = None

    def __init__(self, feature_encoder, num_filter, num_labels, n_estimators,
                 word2vec_model_file_path, **kwargs):
        """Build the CNN and the random forest that is trained on its outputs.

        :param feature_encoder: encoder stored on the instance and handed to
            both sub-models; its ``to_embedding_weight`` builds the initial
            embedding weight.
        :param num_filter: first entry of each of the three first-layer
            convolution specs.
        :param num_labels: forwarded to ``WordEmbeddingCNN``.
        :param n_estimators: forwarded to ``BowRandomForest``.
        :param word2vec_model_file_path: argument passed to
            ``feature_encoder.to_embedding_weight``.
        :param kwargs: keys read here, defaults in parentheses:
            ``init_model`` (True), ``rand_weight`` (False), ``dataset_flag``
            (no default; a missing key raises ``KeyError`` unless
            ``rand_weight`` is truthy), ``verbose`` (0),
            ``embedding_weight_trainable`` (False), ``nb_epoch`` (25),
            ``batch_size`` (32), ``lr`` (1e-2).
        """
        self.static_w2v_cnn = None
        self.bow_randomforest = None
        self.feature_encoder = feature_encoder
        if not kwargs.get('init_model', True):
            # Skip building the models; typically used before restoring a
            # saved model with model_from_pickle().
            return

        if kwargs.get('rand_weight', False):
            # CNN (rand) mode: no pre-built embedding weight is supplied.
            weight = None
        elif kwargs['dataset_flag'] == 0:
            # Training set: build the weight once, then reuse the class cache.
            if RFAndWordEmbeddingCnnMerge.train_data_weight is None:
                RFAndWordEmbeddingCnnMerge.train_data_weight = feature_encoder.to_embedding_weight(
                    word2vec_model_file_path)
            weight = RFAndWordEmbeddingCnnMerge.train_data_weight
        else:
            # kwargs['dataset_flag'] > 0: validation data, cached separately.
            if RFAndWordEmbeddingCnnMerge.val_data_weight is None:
                RFAndWordEmbeddingCnnMerge.val_data_weight = feature_encoder.to_embedding_weight(
                    word2vec_model_file_path)
            weight = RFAndWordEmbeddingCnnMerge.val_data_weight
        # print(weight)
        self.static_w2v_cnn = WordEmbeddingCNN(
            rand_seed=1377,
            verbose=kwargs.get('verbose', 0),
            feature_encoder=feature_encoder,
            # optimizers='adadelta',
            optimizers='sgd',
            # only used in CNN (rand) mode
            word_embedding_dim=50,
            # initialise the embedding layer from the pre-trained w2v weight
            embedding_init_weight=weight,
            # by default the embedding weights stay fixed during training
            embedding_weight_trainable=kwargs.get('embedding_weight_trainable', False),
            num_labels=num_labels,
            l1_conv_filter_type=[
                [num_filter, 3, -1, 'valid', (-1, 1), 0.5, 'relu', 'none'],
                [num_filter, 4, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
                [num_filter, 5, -1, 'valid', (-1, 1), 0., 'relu', 'none'],
            ],
            l2_conv_filter_type=[],
            full_connected_layer_units=[],
            embedding_dropout_rate=0.,
            nb_epoch=kwargs.get('nb_epoch', 25),
            batch_size=kwargs.get('batch_size', 32),
            earlyStoping_patience=30,
            lr=kwargs.get('lr', 1e-2),
            show_validate_accuracy=True if kwargs.get('verbose', 0) > 0 else False,
            # output_regularizer=('l2', 0.5),
            output_constraints=('maxnorm', 3),
            # must be True so intermediate outputs can be read back as features
            save_middle_output=True,
        )

        self.bow_randomforest = BowRandomForest(
            rand_seed=1377,
            verbose=kwargs.get('verbose', 0),
            feature_encoder=feature_encoder,
            n_estimators=n_estimators,
            min_samples_leaf=1,
        )

    def fit(self, train_data=None, validation_data=None):
        """Train the CNN, then train the random forest on the CNN's outputs.

        :param train_data: ``(train_X, train_y)`` pair.  Required in practice:
            it is unpacked unconditionally, so the ``None`` default fails.
        :param validation_data: ``(validation_X, validation_y)`` pair; same
            remark as for ``train_data``.
        :return: whatever ``BowRandomForest.fit`` returns.
        """
        train_X, train_y = train_data
        validation_X, validation_y = validation_data

        self.static_w2v_cnn.fit(train_data, validation_data)

        # Element 4 of the layer outputs is used as the feature matrix.
        # NOTE(review): which layer index 4 refers to is decided by
        # WordEmbeddingCNN.get_layer_output -- confirm there.
        train_x_features = self.static_w2v_cnn.get_layer_output(train_X)[4]
        validation_x_features = self.static_w2v_cnn.get_layer_output(validation_X)[4]

        return self.bow_randomforest.fit((train_x_features, train_y),
                                         (validation_x_features, validation_y))

    def save_model(self, path):
        """Pickle the feature encoder, the CNN and the forest to ``path``.

        The three objects are written one after another to a single file, in
        the order that ``model_from_pickle`` reads them back.

        :param path: path of the model file to write.
        :type path: str
        :return: None
        """
        # The context manager closes (and so flushes) the file even when one
        # of the dumps raises; the handle used to be left open.
        with open(path, 'wb') as model_file:
            pickle.dump(self.feature_encoder, model_file)
            pickle.dump(self.static_w2v_cnn, model_file)
            pickle.dump(self.bow_randomforest, model_file)

    def model_from_pickle(self, path):
        """Restore the three sub-objects written by ``save_model``.

        The loaded objects are assigned to this instance; nothing is returned.

        :param path: path of the model file to read.
        :type path: str
        :return: None
        """
        # SECURITY: unpickling executes code embedded in the file -- only load
        # model files from a trusted source.
        # ``open`` replaces the Python-2-only ``file`` builtin, and the
        # context manager closes the handle, which used to leak.
        with open(path, 'rb') as model_file:
            self.feature_encoder = pickle.load(model_file)
            self.static_w2v_cnn = pickle.load(model_file)
            self.bow_randomforest = pickle.load(model_file)

    @staticmethod
    def get_feature_encoder(**kwargs):
        """Create the feature encoder used by this classifier.

        :param kwargs: ``input_length`` is required.  Optional keys, defaults
            in parentheses: ``need_segmented`` (True), ``verbose`` (0),
            ``full_mode`` (False), ``feature_type`` ('word'),
            ``vocabulary_including_test_set`` (True),
            ``update_dictionary`` (True).
        :return: a ``FeatureEncoder`` instance.
        """
        # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
        assert 'input_length' in kwargs, '请提供 input_length 的属性值'
        from data_processing_util.feature_encoder.onehot_feature_encoder import FeatureEncoder

        feature_encoder = FeatureEncoder(
            need_segmented=kwargs.get('need_segmented', True),
            sentence_padding_length=kwargs['input_length'],
            verbose=kwargs.get('verbose', 0),
            full_mode=kwargs.get('full_mode', False),
            remove_stopword=True,
            replace_number=True,
            lowercase=True,
            zhs2zht=True,
            remove_url=True,
            padding_mode='center',
            add_unkown_word=True,
            feature_type=kwargs.get('feature_type', 'word'),
            vocabulary_including_test_set=kwargs.get('vocabulary_including_test_set', True),
            update_dictionary=kwargs.get('update_dictionary', True),
        )

        return feature_encoder

    def batch_predict_bestn(self, sentences, transform_input=False, bestn=1):
        """Predict the ``bestn`` best classes for a batch of sentences.

        :param sentences: sentences to classify.
        :type sentences: array-like
        :param transform_input: when True, ``sentences`` is first passed
            through ``self.static_w2v_cnn.transform`` (so raw sentences can be
            supplied); when False it is used as given.
        :type transform_input: bool
        :param bestn: number of best results to return per sentence.
        :type bestn: int
        :return: the result of ``BowRandomForest.batch_predict_bestn``
            (documented upstream as ``y_pred_result, y_pred_score``).
        """
        if transform_input:
            sentences = self.static_w2v_cnn.transform(sentences)

        # Same layer output (element 4) that fit() used as training features.
        sentence_features = self.static_w2v_cnn.get_layer_output(sentences)[4]

        # The forest receives ready-made features, hence transform_input=False.
        return self.bow_randomforest.batch_predict_bestn(sentence_features,
                                                         transform_input=False,
                                                         bestn=bestn)