from sklearn import svm, tree, linear_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from read_utils import TextConverter

# ---------------- data loading ----------------
train_files = '../data/cnews.train.txt'
val_files = '../data/cnews.val.txt'
test_files = '../data/cnews.test.txt'
save_file = 'cnews.vocab_label.pkl'

# Builds (or loads) the vocabulary / label mapping from the training corpus.
converter = TextConverter(train_files, save_file, max_vocab=5000)
print(converter.vocab_size)
print(converter.label)

train_texts, train_labels = converter.load_data(train_files)
val_texts, val_labels = converter.load_data(val_files)
test_texts, test_labels = converter.load_data(test_files)

# ---------------- feature extraction ----------------
# Character-level TF-IDF (token_pattern matches single word characters),
# sublinear tf scaling, rare/overly-common terms filtered by min_df/max_df.
vec = TfidfVectorizer(
    ngram_range=(1, 1),
    min_df=3,
    max_df=0.9,
    use_idf=1,
    smooth_idf=1,
    sublinear_tf=1,
    token_pattern=r"(?u)\w",
)
# NOTE(review): .toarray() densifies the sparse matrices — assumes the
# corpus fits in memory; downstream code presumably expects dense arrays.
train_features = vec.fit_transform(train_texts).toarray()
val_features = vec.transform(val_texts).toarray()
test_features = vec.transform(test_texts).toarray()
# Ensure the checkpoint directory exists; exist_ok=True prevents a
# FileExistsError crash when the script is re-run against an existing dir.
os.makedirs(model_path, exist_ok=True)

train_files = '../data/cnews.train.txt'
val_files = '../data/cnews.val.txt'
test_files = '../data/cnews.test.txt'
save_file = 'cnews.vocab_label.pkl'

# Data preprocessing: build/load the vocabulary and label mapping.
converter = TextConverter(train_files, save_file,
                          max_vocab=Config.vocab_size,
                          seq_length=Config.seq_length)
print('vocab size:', converter.vocab_size)
print('labels:', converter.label)

# Convert the test split to padded id arrays and wrap it in a batch generator.
test_texts, test_labels = converter.load_data(test_files)
test_x, test_x_len, test_y = converter.texts_to_arr(test_texts, test_labels)
test_g = converter.val_samples_generator(test_x, test_x_len, test_y,
                                         Config.batch_size)

model = Model(Config)

# Restore the most recently saved checkpoint, if one exists.
checkpoint_path = tf.train.latest_checkpoint(model_path)
if checkpoint_path:
    model.load(checkpoint_path)

print('start to testing...')
model.test(test_g)