def run_main():
    """Entry point: clean the corpus, split it, extract tf-idf features,
    train a Gaussian Naive Bayes classifier and report test accuracy.

    Relies on module-level names: is_first_run, dataset_path,
    output_text_filename, output_cln_text_filename, and the helpers
    read_and_save_to_csv, proc_text, split_train_test,
    get_word_list_from_data, extract_feat_from_data, cal_acc.
    """
    # --- 1. Read, clean and persist the raw text (first run only) ---
    if is_first_run:
        print('处理清洗文本数据中...', end=' ')
        read_and_save_to_csv()  # raw text + labels -> csv
        raw_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                             encoding='utf-8')
        raw_df['text'] = raw_df['text'].apply(proc_text)
        raw_df = raw_df[raw_df['text'] != '']  # drop rows emptied by cleaning
        raw_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                      index=None, encoding='utf-8')
        print('完成,并保存结果。')

    # --- 2. Train / test split (80% of each sentiment class) ---
    print('加载处理好的文本数据')
    cleaned_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                             encoding='utf-8')
    train_df, test_df = split_train_test(cleaned_df)
    print('训练集中各类的数据个数:', train_df.groupby('label').size())
    print('测试集中各类的数据个数:', test_df.groupby('label').size())

    # --- 3. Feature extraction ---
    n_common_words = 200
    print('统计词频...')
    # Frequency distribution over every word in the training split.
    freq_dist = nltk.FreqDist(get_word_list_from_data(train_df))
    top_words = freq_dist.most_common(n_common_words)  # "common word" vocabulary
    print('出现最多的{}个词是:'.format(n_common_words))
    for word, count in top_words:
        print('{}: {}次'.format(word, count))
    print()

    # _X holds the tf-idf of each common word per row; _y the sentiment label.
    text_collection = TextCollection(train_df['text'].values.tolist())
    print('训练样本提取特征...', end=' ')
    train_X, train_y = extract_feat_from_data(train_df, text_collection, top_words)
    print('完成')
    print()
    print('测试样本提取特征...', end=' ')
    test_X, test_y = extract_feat_from_data(test_df, text_collection, top_words)
    print('完成')

    # --- 4. Fit Gaussian Naive Bayes on the training features ---
    print('训练模型...', end=' ')
    gnb = GaussianNB()
    gnb.fit(train_X, train_y)
    print('完成')
    print()

    # --- 5. Predict on the held-out set and report accuracy ---
    print('测试模型...', end=' ')
    test_pred = gnb.predict(test_X)  # e.g. array([3., 3., 2., 0., ...])
    print('完成')
    print('准确率:', cal_acc(test_y, test_pred))
import tools
import matplotlib.pyplot as plt
import dataset_loader
import cfg.config as config
import torch

# Ad-hoc debug probe: pull one batch from the validation data flow,
# visualize the targets, and sanity-check the accuracy helper.
cfg = config.get_cfg_defaults()
# batch size 5; train=True despite the "val_" name — presumably intentional
# for this probe, but verify against Data_flow's semantics.
val_data_flow = dataset_loader.Data_flow(5, cfg.TRAIN.raw_data_file, cfg.img_dir,
                                         [cfg.img_h, cfg.img_w], cfg.out_features,
                                         train=True)
input, targets = val_data_flow.load_next_batch()
# Show the batch images (NCHW -> NHWC for plotting) with their target overlays.
tools.show_targets(input.cpu().permute(0, 2, 3, 1).numpy(), targets.cpu().numpy(),
                   tools.get_preds(targets.numpy()))
# NOTE(review): accuracy is computed of the targets' own predictions against
# themselves (trivially perfect), and "+ 10" offsets the printed value —
# looks like debug residue; confirm before trusting this output.
print(
    tools.cal_acc(tools.get_preds(targets.numpy()),
                  tools.get_preds(targets.numpy())) + 10)

# Commented-out scaffold for parsing a training log and plotting curves;
# kept as-is (dead code — candidate for removal once confirmed unused).
# class struct:
#     def __init__(self):
#         self.x = []
#         self.y = []
#
#
# epoch = struct()
# train_l = struct()
# val_l = struct()
# train_a = struct()
# val_a = struct()
# time = struct()
# with open('/home/agni/Downloads/train_log(1).txt') as f:
#     for i, line in enumerate(f):
def run_main():
    """Main driver for the text-sentiment pipeline.

    Steps: optional raw-text cleanup -> per-label 80/20 split -> word
    frequency stats -> tf-idf feature extraction -> GaussianNB training
    -> accuracy on the test split.  Uses module-level configuration
    (is_first_run, dataset_path, output_*_filename) and helpers
    (read_and_save_to_csv, proc_text, split_train_test,
    get_word_list_from_data, extract_feat_from_data, cal_acc).
    """
    # Step 1: only on the first run, clean the raw corpus and cache it as csv.
    if is_first_run:
        print('处理清洗文本数据中...', end=' ')
        read_and_save_to_csv()
        source_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                                encoding='utf-8')
        source_df['text'] = source_df['text'].apply(proc_text)
        # Cleaning may leave empty strings behind; filter those rows out.
        source_df = source_df[source_df['text'] != '']
        source_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                         index=None, encoding='utf-8')
        print('完成,并保存结果。')

    # Step 2: load the cached clean corpus and split it.
    print('加载处理好的文本数据')
    corpus_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                            encoding='utf-8')
    df_train, df_test = split_train_test(corpus_df)
    print('训练集中各类的数据个数:', df_train.groupby('label').size())
    print('测试集中各类的数据个数:', df_test.groupby('label').size())

    # Step 3: build the common-word vocabulary from training-set word counts.
    n_common_words = 200
    print('统计词频...')
    training_words = get_word_list_from_data(df_train)
    word_freqs = nltk.FreqDist(training_words).most_common(n_common_words)
    print('出现最多的{}个词是:'.format(n_common_words))
    for word, count in word_freqs:
        print('{}: {}次'.format(word, count))
    print()

    # Features: tf-idf value of each common word per sample; labels: sentiment.
    collection = TextCollection(df_train['text'].values.tolist())
    print('训练样本提取特征...', end=' ')
    train_X, train_y = extract_feat_from_data(df_train, collection, word_freqs)
    print('完成')
    print()
    print('测试样本提取特征...', end=' ')
    test_X, test_y = extract_feat_from_data(df_test, collection, word_freqs)
    print('完成')

    # Step 4: train the Gaussian Naive Bayes model.
    print('训练模型...', end=' ')
    gnb = GaussianNB()
    gnb.fit(train_X, train_y)
    print('完成')
    print()

    # Step 5: evaluate on the test split.
    print('测试模型...', end=' ')
    test_pred = gnb.predict(test_X)
    print('完成')
    print('准确率:', cal_acc(test_y, test_pred))
def run_main():
    """Train and evaluate a sentiment classifier on the cleaned text corpus.

    Pipeline: optional raw-text cleaning -> per-label 80/20 split ->
    word-frequency vocabulary (top n_common_words) -> tf-idf feature
    extraction (cached to .npy when load_np is False) -> logistic-regression
    training -> accuracy report.  Depends on module-level config/helpers:
    is_first_run, load_np, dataset_path, output_*_filename,
    read_and_save_to_csv, proc_text, split_train_test,
    get_word_list_from_data, extract_feat_from_data, get_model, cal_acc.
    """
    # 1. Read / clean / prepare data (first run only).
    if is_first_run:
        print('处理清洗文本数据中...', end=' ')
        # Dump raw text + labels to csv, then clean the text column.
        read_and_save_to_csv()
        text_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                              encoding='utf-8')
        text_df['text'] = text_df['text'].apply(proc_text)
        # Drop rows whose text became empty after cleaning.
        text_df = text_df[text_df['text'] != '']
        text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                       index=None, encoding='utf-8')
        print('完成,并保存结果。')

    # 2. Train/test split (80% of each sentiment label).
    print('加载处理好的文本数据')
    clean_text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                                encoding='utf-8')
    train_text_df, test_text_df = split_train_test(clean_text_df)
    print('训练集中各类的数据个数:', train_text_df.groupby('label').size())
    print('测试集中各类的数据个数:', test_text_df.groupby('label').size())

    # 3. Feature extraction: tf-idf over the n_common_words most frequent words.
    n_common_words = 1000
    all_words_in_train = get_word_list_from_data(train_text_df)
    print('统计词频...')
    print("总单词数", len(all_words_in_train))
    fdisk = nltk.FreqDist(all_words_in_train)
    print("词频", len(fdisk))
    common_words_freqs = fdisk.most_common(n_common_words)
    print('出现最多的{}个词是:'.format(n_common_words))
    for word, count in common_words_freqs:
        print('{}: {}次'.format(word, count))
    print()

    text_collection = TextCollection(train_text_df['text'].values.tolist())
    print('训练样本提取特征...', end=' ')
    if load_np:
        # Reuse features cached by a previous run.  Keep them 2-D
        # (n_samples, n_features): scikit-learn estimators reject 3-D input.
        # BUG FIX: the previous code reshaped to (n, 1, n_features) here, which
        # made the later gnb.fit()/gnb.predict() calls raise ValueError.  The
        # 3-D shape is only needed by the (currently commented-out) keras
        # model; reshape at that call site if it is ever re-enabled.
        train_X = np.load("train_x.npy")
        print(train_X.shape)
        train_y = np.load("train_y.npy")
        test_X = np.load("test_X.npy")
        test_y = np.load("test_y.npy")
    else:
        train_X, train_y = extract_feat_from_data(train_text_df, text_collection,
                                                  common_words_freqs)
        np.save("train_x.npy", train_X)
        np.save("train_y.npy", train_y)
        print('完成')
        print()
        print('测试样本提取特征...', end=' ')
        test_X, test_y = extract_feat_from_data(test_text_df, text_collection,
                                                common_words_freqs)
        np.save("test_X.npy", test_X)
        np.save("test_y.npy", test_y)
    print('完成')

    # 4. Train the classifier.
    print('训练模型...', end=' ')
    # gnb = GaussianNB() scored ~0.29 here; one-vs-rest logistic regression
    # replaced it (variable name kept for continuity with the old code).
    gnb = LogisticRegression(multi_class="ovr")
    model = get_model(n_common_words)
    onehot_train_y = keras.utils.to_categorical(train_y, num_classes=4)
    onehot_test_y = keras.utils.to_categorical(test_y, num_classes=4)
    # model.fit(train_X.reshape(train_X.shape[0], 1, -1), onehot_train_y,
    #           epochs=50, batch_size=128, verbose=1)
    # score = model.evaluate(test_X.reshape(test_X.shape[0], 1, -1),
    #                        onehot_test_y, batch_size=128)
    gnb.fit(train_X, train_y)
    # NOTE(review): the keras model is never fit above (model.fit is commented
    # out), so "model.h5" holds untrained weights — confirm this is intended.
    model.save_weights("model.h5")
    print('完成')
    # print('score', score)

    # 5. Predict on the held-out set and report accuracy.
    print('测试模型...', end=' ')
    # test_pred = model.predict(test_X, 128)
    test_pred = gnb.predict(test_X)
    # test_pred is a 1-D ndarray of predicted labels, e.g. [3., 3., 2., ...]
    print('完成')
    print('准确率:', cal_acc(test_y, test_pred))