Esempio n. 1
0
# Test dataset X-y.
# `.values` is used instead of `.as_matrix()`: pandas deprecated `as_matrix`
# in 0.23 and removed it in 1.0, while `.values` exists in old and new
# releases alike.
test_X = test_data['SENTENCE'].values
test_y = test_data['LABEL_INDEX'].values
# endregion -------------- load training and test data ---------------

# region -------------- cross validation -------------
if config['verbose'] > 0:
    print('-' * 20)
    print('cross validation')

from traditional_classify.bow_rf.bow_rf_model import BowRandomForest

# Cross-validate with the `n_estimators_list` values taken from
# `estimator_paramter_list` (defined outside this excerpt).
# NOTE(review): the exact meaning of each keyword is defined by
# BowRandomForest.cross_validation, which is not visible here.
BowRandomForest.cross_validation(
    train_data=(train_X, train_y),
    test_data=(test_X, test_y),
    shuffle_data=True,
    n_estimators_list=estimator_paramter_list,
    feature_type=feature_type,
    word2vec_to_solve_oov=False,
    # word2vec_model_file_path is not passed here.
    verbose=config['verbose'],
    cv=3,
    need_segmented=True,
    need_validation=True,
    include_train_data=True,
)

if config['verbose'] > 0:
    print('-' * 20)
# endregion -------------- cross validation ---------------
Esempio n. 2
0
logging.debug('start running!')
logging.debug('=' * 20)

from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil

# Load the train/test split and the label <-> index mappings via DataUtil.
data_util = DataUtil()
train_data, test_data = data_util.load_train_test_data(config)
label_to_index, index_to_label = data_util.get_label_index()

# `.values` replaces `.as_matrix()`, which pandas deprecated in 0.23 and
# removed in 1.0; `.values` is available in old and new releases alike.
train_x = train_data['SENTENCE'].values
train_y = train_data['LABEL_INDEX'].values
test_x = test_data['SENTENCE'].values
test_y = test_data['LABEL_INDEX'].values

from traditional_classify.bow_rf.bow_rf_model import BowRandomForest

# Cross-validate over a short, hand-picked list of `n_estimators` values.
# Grids tried previously (kept for reference):
#   [10,20,30,40,50,60,70,80,90,100,200,300,400,500,1000,2000,3000,4000,5000]
#   [290]
#   range(10,1010,10)
#   [330]
BowRandomForest.cross_validation(
    train_data=(train_x, train_y),
    test_data=(test_x, test_y),
    shuffle_data=False,
    n_estimators_list=[640, 470, 490],
    verbose=0,
    feature_type='word',
    word2vec_to_solve_oov=True,
    # The model path is resolved by DataUtil from the short model name.
    word2vec_model_file_path=data_util.transform_word2vec_model_name('50d_weibo_100w'),
)
Esempio n. 3
0
    ])

# region -------------- cross validation -------------
# Print a two-line banner when verbose output is requested.
if 0 < config['verbose']:
    print('\n'.join(['-' * 20, 'cross validation']))

from traditional_classify.bow_rf.bow_rf_model import BowRandomForest

# Gather the keyword arguments up front so the call itself stays short.
# feature_type, need_segmented and word2vec_model_file_path are deliberately
# not passed here.
cv_options = {
    'train_data': None,
    'test_data': None,
    'cv_data': cv_data,
    'shuffle_data': True,
    'n_estimators_list': estimator_paramter_list,
    'word2vec_to_solve_oov': False,
    'verbose': config['verbose'],
    'cv': 3,
    # Input is passed in directly.
    'need_transform_input': False,
    'need_validation': True,
    'include_train_data': True,
}
BowRandomForest.cross_validation(**cv_options)

# Closing separator, again only in verbose mode.
if 0 < config['verbose']:
    print('-' * 20)
# endregion -------------- cross validation ---------------
Esempio n. 4
0
}

from version_2.data_processing.data_util import DataUtil

# Load the train/test split and the label <-> index mappings via DataUtil.
data_util = DataUtil()
train_data, test_data = data_util.load_train_test_data(config)
label_to_index, index_to_label = data_util.get_label_index()

# `.values` replaces `.as_matrix()`, which pandas deprecated in 0.23 and
# removed in 1.0; `.values` is available in old and new releases alike.
train_x = train_data['TEXT'].values
train_y = train_data['STANCE_INDEX'].values
test_x = test_data['TEXT'].values
test_y = test_data['STANCE_INDEX'].values

from traditional_classify.bow_rf.bow_rf_model import BowRandomForest

# Cross-validate over n_estimators = 10, 20, ..., 1000.
# Grids tried previously (kept for reference):
#   [10,20,30,40,50,60,70,80,90,100,200,300,400,500,1000,2000,3000,4000,5000]
#   [610]
BowRandomForest.cross_validation(
    train_data=(train_x, train_y),
    test_data=(test_x, test_y),
    shuffle_data=False,
    n_estimators_list=range(10, 1010, 10),
    verbose=0,
    feature_type='word',
    word2vec_to_solve_oov=False,
    # NOTE(review): machine-specific absolute path; presumably unused while
    # word2vec_to_solve_oov is False - confirm against the callee.
    word2vec_model_file_path='/home/jdwang/PycharmProjects/corprocessor/word2vec/vector/50dim/vector1000000_50dim.gem',
)
Esempio n. 5
0
                                    rand_seed=3,
                                    )

# NOTE(review): the cross_validation call below passes verbose=0 explicitly,
# so this value is not used by that call.
verbose = 1

shuffle_data = True
import numpy as np
# Draw a fresh random seed in [0, 100000) for every run. An integer bound is
# used instead of the float literal 1e5, since randint works on integers.
rand_seed = np.random.randint(0, 100000)
# rand_seed = 1000

# Positional arguments: the cross-validation data, the (X, y) test pair and
# the path of the detail report file.
# `.values` replaces `.as_matrix()`, which pandas deprecated in 0.23 and
# removed in 1.0.
BowRandomForest.cross_validation(
    cv_data,
    (test_data[u'SENTENCE'].values, test_y),
    'result/rf_bow_cv_detail.txt',
    verbose=0,
    rand_seed=rand_seed,
    shuffle_data=shuffle_data,
    feature_type='seg',
    n_estimators=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 400, 500, 1000, 2000, 3000, 4000, 5000],
    # n_estimators = [2000],
    remove_stopword=True,
    word2vec_to_solve_oov=False,
    word2vec_model_file_path=config['word2vec_model_file_path'],
)

# Report the wall-clock running time; `start_time` is set outside this excerpt.
end_time = timeit.default_timer()
elapsed = end_time - start_time
# Parenthesised single-argument print is valid on both Python 2 and Python 3
# and matches the print(...) style used elsewhere in this file.
print('end! Running time:%ds!' % elapsed)
logging.debug('=' * 20)
# Let logging do the %-formatting lazily.
logging.debug('end! Running time:%ds!', elapsed)
Esempio n. 6
0
# `.values` replaces `.as_matrix()`, which pandas deprecated in 0.23 and
# removed in 1.0; `.values` is available in old and new releases alike.
train_y = train_data['LABEL_INDEX'].values
# test dataset X-y
test_X = test_data['SENTENCE'].values
test_y = test_data['LABEL_INDEX'].values
# Banner announcing that data loading has finished.
print('=' * 30 + '数据加载完毕' + '=' * 30)
# endregion

# region -------------- 3 cross validation -------------
if config['verbose'] > 0:
    print('-' * 20)
    print('cross validation')

# Cross-validate with the `n_estimators_list` values taken from
# `estimator_paramter_list` (defined outside this excerpt).
# NOTE(review): the exact meaning of each keyword is defined by
# BowRandomForest.cross_validation, which is not visible here.
BowRandomForest.cross_validation(
    train_data=(train_X, train_y),
    test_data=(test_X, test_y),
    shuffle_data=True,
    n_estimators_list=estimator_paramter_list,
    feature_type=feature_type,
    word2vec_to_solve_oov=False,
    # word2vec_model_file_path is not passed here.
    verbose=config['verbose'],
    cv=3,
    need_segmented=True,
    need_validation=True,
    include_train_data=True,
)

if config['verbose'] > 0:
    print('-' * 20)
# endregion
Esempio n. 7
0

# region -------------- cross validation -------------
# Print a two-line banner when verbose output is requested.
if 0 < config['verbose']:
    print('\n'.join(['-' * 20, 'cross validation']))

from traditional_classify.bow_rf.bow_rf_model import BowRandomForest

# Gather the keyword arguments up front so the call itself stays short.
# feature_type, need_segmented and word2vec_model_file_path are deliberately
# not passed here.
cv_options = {
    'train_data': None,
    'test_data': None,
    'cv_data': cv_data,
    'shuffle_data': True,
    'n_estimators_list': estimator_paramter_list,
    'word2vec_to_solve_oov': False,
    'verbose': config['verbose'],
    'cv': 3,
    # Input is passed in directly.
    'need_transform_input': False,
    'need_validation': True,
    'include_train_data': True,
}
BowRandomForest.cross_validation(**cv_options)

# Closing separator, again only in verbose mode.
if 0 < config['verbose']:
    print('-' * 20)
# endregion -------------- cross validation ---------------
# Minimal run configuration; only the verbosity level is set here.
config = {
    'verbose': 1,
}

from version_2.data_processing.data_util import DataUtil

# Load the train/test split and the label <-> index mappings via DataUtil.
data_util = DataUtil()
train_data, test_data = data_util.load_train_test_data(config)
label_to_index, index_to_label = data_util.get_label_index()

# `.values` replaces `.as_matrix()`, which pandas deprecated in 0.23 and
# removed in 1.0; `.values` is available in old and new releases alike.
train_x = train_data['TEXT'].values
train_y = train_data['STANCE_INDEX'].values
test_x = test_data['TEXT'].values
test_y = test_data['STANCE_INDEX'].values

from traditional_classify.bow_rf.bow_rf_model import BowRandomForest

# Cross-validate over n_estimators = 10, 20, ..., 1000.
# Grids tried previously (kept for reference):
#   [10,20,30,40,50,60,70,80,90,100,200,300,400,500,1000,2000,3000,4000,5000]
#   [610]
BowRandomForest.cross_validation(
    train_data=(train_x, train_y),
    test_data=(test_x, test_y),
    shuffle_data=False,
    n_estimators_list=range(10, 1010, 10),
    verbose=0,
    feature_type='word',
    word2vec_to_solve_oov=False,
    # NOTE(review): machine-specific absolute path; presumably unused while
    # word2vec_to_solve_oov is False - confirm against the callee.
    word2vec_model_file_path=
    '/home/jdwang/PycharmProjects/corprocessor/word2vec/vector/50dim/vector1000000_50dim.gem'
)