def using_iris():
    '''Shows how load_iris works.'''
    # 1. Load the Iris dataset
    features, targets, classes = load_iris()
    n, f_dim = features.shape
    print(f'* The dataset contains {n} samples')
    print(f'* Each sample has {f_dim} features')

    # 2. Get the first datapoint
    first_feature_set = features[0, :]
    first_target = targets[0]
    print(f'* The first sample has the features:\n{first_feature_set} '
          f'and belongs to the class {first_target}')
    print('* Each datapoint can belong to any of the following classes:'
          f'\n{classes}')

    # 3. Split into train and test sets
    (train_features, train_targets), (test_features, test_targets) = \
        split_train_test(features, targets, train_ratio=0.9)
    train_n, test_n = train_features.shape[0], test_features.shape[0]
    print(f'* The train set contains {train_n} samples')
    print(f'* The test set contains {test_n} samples')
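# A minimal sketch of the split_train_test helper assumed above (its definition
# is not shown in this snippet): it shuffles the samples and returns
# ((train_X, train_y), (test_X, test_y)) according to train_ratio. The exact
# behaviour is an assumption based on how the function is called here.
import numpy as np

def split_train_test(features: np.ndarray, targets: np.ndarray, train_ratio: float = 0.8):
    '''Randomly split features/targets into train and test partitions.'''
    n = features.shape[0]
    indices = np.random.permutation(n)           # shuffle sample indices
    split_at = int(n * train_ratio)              # number of training samples
    train_idx, test_idx = indices[:split_at], indices[split_at:]
    return (features[train_idx], targets[train_idx]), \
           (features[test_idx], targets[test_idx])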
def __init__(
    self,
    features: np.ndarray,
    targets: np.ndarray,
    classes: list = [0, 1, 2],
    train_ratio: float = 0.8
):
    '''
    train_ratio: The ratio of the Iris dataset that will be dedicated to training.
    '''
    (self.train_features, self.train_targets), \
        (self.test_features, self.test_targets) = \
        split_train_test(features, targets, train_ratio)
    self.classes = classes
    self.tree = DecisionTreeClassifier()
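# Hypothetical usage of the wrapper class this __init__ belongs to (only the
# constructor is shown above; the class name IrisTreeTrainer is an assumption).
# It loads the Iris data, builds the trainer, and fits the DecisionTreeClassifier
# attribute on the training split.
if __name__ == '__main__':
    features, targets, classes = load_iris()
    trainer = IrisTreeTrainer(features, targets, classes=classes, train_ratio=0.8)
    trainer.tree.fit(trainer.train_features, trainer.train_targets)
    print('Test accuracy:', trainer.tree.score(trainer.test_features, trainer.test_targets))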
def main():
    """Main entry point."""
    # Step 1: Prepare the dataset
    print('=== Step 1: Prepare the dataset ===')
    if not os.path.exists(constant.cln_text_csv_file):
        print('Cleaning data...')
        # Read the raw csv file
        raw_text_df = pd.read_csv(constant.raw_text_csv_file)
        # Clean the raw data
        cln_text_df = clean_text(raw_text_df)
        # Save the cleaned text data
        cln_text_df.to_csv(constant.cln_text_csv_file, index=None)
        print('Done, results saved to', constant.cln_text_csv_file)
    print('================\n')

    # Step 2: Inspect the cleaned dataset and select the data used for training
    print('=== Step 2: Inspect the dataset ===')
    text_data = pd.read_csv(constant.cln_text_csv_file)
    text_data['date'] = pd.to_datetime(text_data['date'])
    text_data.set_index('date', inplace=True)
    print('Number of samples per class:')
    print(text_data.groupby('label').size())

    # Step 3: Split into train and test sets
    print('=== Step 3: Split train and test sets ===')
    train_text_df, test_text_df = split_train_test(text_data)
    # Basic statistics of the train and test sets
    print('Samples per class in the train set:')
    print(train_text_df.groupby('label').size())
    print('Samples per class in the test set:')
    print(test_text_df.groupby('label').size())
    print('================\n')

    # Step 4: Feature extraction
    print('=== Step 4: Text feature extraction ===')
    # Count word frequencies over the training set
    n_common_words = 200
    print('Counting word frequencies...')
    all_words_in_train = get_word_list_from_data(train_text_df)
    fdisk = nltk.FreqDist(all_words_in_train)
    common_words_freqs = fdisk.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))
    for word, count in common_words_freqs:
        print('{}: {} occurrences'.format(word, count))
    print()

    # Extract features on the training set
    text_collection = TextCollection(train_text_df['text'].values.tolist())
    print('Extracting features from the training samples...')
    train_X, train_y = extract_feat_from_data(train_text_df, text_collection,
                                              common_words_freqs)
    print('Done')
    print()
    print('Extracting features from the test samples...')
    test_X, test_y = extract_feat_from_data(test_text_df, text_collection,
                                            common_words_freqs)
    print('Done')
    print('================\n')

    # Feature processing
    # Feature scaling
    scaler = StandardScaler()
    tr_feat_scaled = scaler.fit_transform(train_X)
    te_feat_scaled = scaler.transform(test_X)
    # Feature selection
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    tr_feat_scaled_sel = sel.fit_transform(tr_feat_scaled)
    te_feat_scaled_sel = sel.transform(te_feat_scaled)
    # PCA dimensionality reduction, keeping components that explain 95% of the variance
    pca = PCA(n_components=0.95)
    tr_feat_scaled_sel_pca = pca.fit_transform(tr_feat_scaled_sel)
    te_feat_scaled_sel_pca = pca.transform(te_feat_scaled_sel)
    print('Feature processing finished')
    print('Feature dimension per sample after processing:', tr_feat_scaled_sel_pca.shape[1])

    # Step 5: Train the models
    models = []
    print('=== Step 5: Train the models ===')
    print('1. Naive Bayes:')
    gnb_model = GaussianNB()
    gnb_model.fit(tr_feat_scaled_sel_pca, train_y)
    models.append(['Naive Bayes', gnb_model])
    print('Done')
    print()

    print('2. Logistic regression:')
    lr_param_grid = [{'C': [1e-3, 1e-2, 1e-1, 1, 10, 100]}]
    lr_model = LogisticRegression()
    best_lr_model = get_best_model(lr_model, tr_feat_scaled_sel_pca, train_y,
                                   lr_param_grid, cv=3)
    models.append(['Logistic Regression', best_lr_model])
    print('Done')
    print()

    print('3. Support vector machine:')
    svm_param_grid = [
        {
            'C': [1e-2, 1e-1, 1, 10, 100],
            'gamma': [0.001, 0.0001],
            'kernel': ['rbf']
        },
    ]
    svm_model = svm.SVC(probability=True)
    best_svm_model = get_best_model(svm_model, tr_feat_scaled_sel_pca, train_y,
                                    svm_param_grid, cv=3)
    models.append(['SVM', best_svm_model])
    print('Done')
    print()

    print('4. Random forest:')
    rf_param_grid = [{'n_estimators': [10, 50, 100, 150, 200]}]
    rf_model = RandomForestClassifier()
    best_rf_model = get_best_model(rf_model, tr_feat_scaled_sel_pca, train_y,
                                   rf_param_grid, cv=3)
    models.append(['Random Forest', best_rf_model])
    print('Done')
    print()

    # Step 6: Evaluate the models
    print('=== Step 6: Evaluate the models ===')
    for i, model in enumerate(models):
        print('{}-{}'.format(i + 1, model[0]))
        # Accuracy
        print('Accuracy:',
              accuracy_score(test_y, model[1].predict(te_feat_scaled_sel_pca)))
        # AUC: use the probability of the positive class (column 1)
        print('AUC:',
              roc_auc_score(test_y,
                            model[1].predict_proba(te_feat_scaled_sel_pca)[:, 1]))
        # Confusion matrix
        print('Confusion matrix:')
        print(confusion_matrix(test_y, model[1].predict(te_feat_scaled_sel_pca)))
        print()
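# A minimal sketch of the get_best_model helper used in Step 5 above (its
# definition is not shown here): it is assumed to wrap sklearn's GridSearchCV,
# fit it on the training data, report the best parameters, and return the
# refitted best estimator.
from sklearn.model_selection import GridSearchCV

def get_best_model(model, X_train, y_train, param_grid, cv=3):
    '''Grid-search param_grid with cv-fold cross-validation and return the best estimator.'''
    grid = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy')
    grid.fit(X_train, y_train)
    print('Best parameters:', grid.best_params_)
    return grid.best_estimator_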
def run_main():
    # Load the cleaned data file
    clean_text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                                encoding='utf-8')
    # Split into train and test sets
    train, test = split_train_test(clean_text_df, 0.8)

    train_text_list = []
    train_label_list = []
    test_text_list = []
    test_label_list = []
    print('Building the datasets')
    for i, r_data in train.iterrows():
        train_text_list.append(r_data['text'])
        train_label_list.append(r_data['label'])
    for i, r_data in test.iterrows():
        test_text_list.append(r_data['text'])
        test_label_list.append(r_data['label'])

    # Shuffle the samples
    c = list(zip(train_text_list, train_label_list))
    np.random.shuffle(c)
    train_text_list[:], train_label_list[:] = zip(*c)
    c = list(zip(test_text_list, test_label_list))
    np.random.shuffle(c)
    test_text_list[:], test_label_list[:] = zip(*c)

    print('Building word vectors')
    # Build a vocabulary from the 4000 most frequent words
    tokenizer = Tokenizer(num_words=4000)
    # Fit on all training texts; the top 4000 words by frequency enter the vocabulary
    tokenizer.fit_on_texts(train_text_list)
    # Convert the train and test texts to integer sequences
    x_train_seq = tokenizer.texts_to_sequences(train_text_list)
    x_test_seq = tokenizer.texts_to_sequences(test_text_list)
    # Pad or truncate to a fixed length
    x_train = sequence.pad_sequences(x_train_seq, maxlen=100)
    x_test = sequence.pad_sequences(x_test_seq, maxlen=100)
    y_train = np.array(train_label_list)
    y_test = np.array(test_label_list)

    print('Building the LSTM model')
    model = Sequential()
    model.add(Embedding(4000, 200))
    model.add(LSTM(64,
                   dropout=0.5,
                   recurrent_dropout=0.5,
                   kernel_regularizer=regularizers.l2(0.1)))
    model.add(Dense(1, activation='sigmoid'))

    batch_size = 64
    epochs = 50
    adam = optimizers.Adam(lr=0.0001)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

    # Record the start time as a formatted string
    startdate = datetime.datetime.now()
    startdate = startdate.strftime("%Y-%m-%d %H:%M:%S")
    history = model.fit(x_train, y_train,
                        validation_split=0.1,
                        batch_size=batch_size,
                        epochs=epochs,
                        shuffle=True)
    enddate = datetime.datetime.now()
    enddate = enddate.strftime("%Y-%m-%d %H:%M:%S")
    print('LSTM training time', subtime(startdate, enddate))
    model.save('lstm_finally.h5')

    # Plot training & validation accuracy
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('Model accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

    # Plot training & validation loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

    scores = model.evaluate(x_test, y_test)
    print('test_loss: %f, accuracy: %f' % (scores[0], scores[1]))
    plot_model(model, to_file='model.png')
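# A minimal sketch of the DataFrame-level split_train_test used above and in the
# later scripts (its definition is not shown here). Based on the comments
# elsewhere ("split 80% of each sentiment label"), it is assumed to take a
# stratified per-label split at the given ratio.
import pandas as pd

def split_train_test(text_df, train_ratio=0.8):
    '''Split a DataFrame with a `label` column into train/test sets, per label.'''
    train_parts, test_parts = [], []
    for _, group in text_df.groupby('label'):
        group = group.sample(frac=1)               # shuffle rows within the label
        split_at = int(len(group) * train_ratio)   # number of training rows
        train_parts.append(group.iloc[:split_at])
        test_parts.append(group.iloc[split_at:])
    return pd.concat(train_parts), pd.concat(test_parts)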
def run_main():
    """Main entry point."""
    # 1. Read, process, and clean the data
    if is_first_run:
        print('Processing and cleaning the text data...', end=' ')
        # On the first run, the raw text data has to be processed and cleaned:
        # read the raw text data and save labels and text to csv
        read_and_save_to_csv()
        # Read the resulting csv file and build the dataset
        text_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                              encoding='utf-8')
        # Process the text column
        text_df['text'] = text_df['text'].apply(proc_text)
        # Filter out empty strings, i.e. drop all empty rows
        text_df = text_df[text_df['text'] != '']
        # Save the processed text data; preprocessing is finished
        text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                       index=None, encoding='utf-8')
        print('Done, results saved.')

    # 2. Split into train and test sets
    print('Loading the processed text data')
    clean_text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                                encoding='utf-8')
    # Split into train and test sets: 80% of each sentiment label goes to training
    train_text_df, test_text_df = split_train_test(clean_text_df)
    # Basic statistics of the train and test sets
    print('Samples per class in the train set:', train_text_df.groupby('label').size())
    print('Samples per class in the test set:', test_text_df.groupby('label').size())

    # 3. Feature extraction
    # Count word frequencies over the training set
    n_common_words = 200
    print('Counting word frequencies...')
    # List of all words that occur in the training set
    all_words_in_train = get_word_list_from_data(train_text_df)
    # Frequency distribution
    fdisk = nltk.FreqDist(all_words_in_train)
    # The 200 most frequent words form the "common word" vocabulary
    common_words_freqs = fdisk.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))
    for word, count in common_words_freqs:
        print('{}: {} occurrences'.format(word, count))
    print()

    # Extract features on the training set
    # The text column is passed as a list
    text_collection = TextCollection(train_text_df['text'].values.tolist())
    # Extract features for the train and test samples:
    # _X holds the tf-idf values of the common words per row, _y the sentiment label
    print('Extracting features from the training samples...', end=' ')
    train_X, train_y = extract_feat_from_data(train_text_df, text_collection,
                                              common_words_freqs)
    print('Done')
    print()
    print('Extracting features from the test samples...', end=' ')
    test_X, test_y = extract_feat_from_data(test_text_df, text_collection,
                                            common_words_freqs)
    print('Done')

    # 4. Train a Naive Bayes model
    print('Training the model...', end=' ')
    # Gaussian naive Bayes classifier
    gnb = GaussianNB()
    # Fit the model on the training features
    gnb.fit(train_X, train_y)
    print('Done')
    print()

    # 5. Prediction
    print('Testing the model...', end=' ')
    # Predict on the test features
    test_pred = gnb.predict(test_X)
    # test_pred is an ndarray, e.g. array([3., 3., 3., 2., 3., ...])
    print('Done')

    # Print the accuracy
    print('Accuracy:', cal_acc(test_y, test_pred))
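# A minimal sketch of the cal_acc helper used above (it is not defined in this
# snippet): it is assumed to compute plain classification accuracy, i.e. the
# fraction of predictions that match the true labels.
import numpy as np

def cal_acc(true_labels, pred_labels):
    '''Fraction of predictions equal to the true labels.'''
    true_labels = np.asarray(true_labels)
    pred_labels = np.asarray(pred_labels)
    return np.mean(true_labels == pred_labels)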
def run_main():
    """Main entry point."""
    # 1. Read, process, and clean the data
    if is_first_run:
        print('Processing and cleaning the text data...', end=' ')
        # On the first run, the raw text data has to be processed and cleaned:
        # read the raw text data and save labels and text to csv
        read_and_save_to_csv()
        # Read the resulting csv file and build the dataset
        text_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                              encoding='utf-8')
        # Process the text column
        text_df['text'] = text_df['text'].apply(proc_text)
        # Filter out empty strings
        text_df = text_df[text_df['text'] != '']
        # Save the processed text data
        text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                       index=None, encoding='utf-8')
        print('Done, results saved.')

    # 2. Split into train and test sets
    print('Loading the processed text data')
    clean_text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                                encoding='utf-8')
    # Split into train and test sets
    train_text_df, test_text_df = split_train_test(clean_text_df)
    # Basic statistics of the train and test sets
    print('Samples per class in the train set:', train_text_df.groupby('label').size())
    print('Samples per class in the test set:', test_text_df.groupby('label').size())

    # 3. Feature extraction
    # Count word frequencies over the training set
    n_common_words = 200
    print('Counting word frequencies...')
    all_words_in_train = get_word_list_from_data(train_text_df)
    fdisk = nltk.FreqDist(all_words_in_train)
    common_words_freqs = fdisk.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))
    for word, count in common_words_freqs:
        print('{}: {} occurrences'.format(word, count))
    print()

    # Extract features on the training set
    text_collection = TextCollection(train_text_df['text'].values.tolist())
    print('Extracting features from the training samples...', end=' ')
    train_X, train_y = extract_feat_from_data(train_text_df, text_collection,
                                              common_words_freqs)
    print('Done')
    print()
    print('Extracting features from the test samples...', end=' ')
    test_X, test_y = extract_feat_from_data(test_text_df, text_collection,
                                            common_words_freqs)
    print('Done')

    # 4. Train a Naive Bayes model
    print('Training the model...', end=' ')
    gnb = GaussianNB()
    gnb.fit(train_X, train_y)
    print('Done')
    print()

    # 5. Prediction
    print('Testing the model...', end=' ')
    test_pred = gnb.predict(test_X)
    print('Done')

    # Print the accuracy
    print('Accuracy:', cal_acc(test_y, test_pred))
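# A minimal sketch of the extract_feat_from_data helper these scripts rely on
# (its definition is not shown). It is assumed to build, for each row, the
# tf-idf value of every "common word" using nltk's TextCollection, and to return
# the feature matrix together with the label column; the exact layout is an
# assumption based on how the return values are used.
import numpy as np

def extract_feat_from_data(text_df, text_collection, common_words_freqs):
    '''Return (X, y): tf-idf of the common words per row, plus the labels.'''
    common_words = [word for word, _ in common_words_freqs]
    n_samples = text_df.shape[0]
    X = np.zeros((n_samples, len(common_words)))
    y = np.zeros(n_samples)
    for i, (_, row) in enumerate(text_df.iterrows()):
        tokens = row['text'].split()    # the cleaned text is whitespace-separated words
        for j, word in enumerate(common_words):
            X[i, j] = text_collection.tf_idf(word, tokens)  # tf-idf of this word in this row
        y[i] = row['label']
    return X, y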
def run_main():
    # Drop rows (one-off cleanup, kept commented out)
    '''
    text_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                          encoding='utf-8')
    df = text_df.drop(labels=range(106728, 159814), axis=0)
    df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
              index=None, encoding='utf-8')
    '''

    # Remap labels (one-off cleanup, kept commented out)
    '''
    text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                          encoding='utf-8')
    text_df['label'].replace(4, 0, inplace=True)
    text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                   index=None, encoding='utf-8')

    # Show the data before stop-word removal
    text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                          encoding='utf-8')
    print(text_df.head(5))
    '''

    # Stop-word removal (one-off cleanup, kept commented out)
    '''
    text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                          encoding='utf-8')
    text_df['text'] = text_df['text'].apply(proc_text)
    # Filter out empty strings, i.e. drop all empty rows
    text_df = text_df[text_df['text'] != '']
    # Save the processed text data; preprocessing is finished
    text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                   index=None, encoding='utf-8')
    print(text_df.head(5))
    print('Done, results saved.')
    '''

    # Train/test split --------------------------------------------------------
    print('Loading the processed text data')
    clean_text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                                encoding='utf-8')
    # Split into train and test sets: 80% of each sentiment label goes to training
    train_text_df, test_text_df = split_train_test(clean_text_df)
    # Basic statistics of the train and test sets
    print('Samples per class in the train set:', train_text_df.groupby('label').size())
    print('Samples per class in the test set:', test_text_df.groupby('label').size())

    # Bag-of-words model ------------------------------------------------------
    clean_text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                                encoding='utf-8')
    train_text_df, test_text_df = split_train_test(clean_text_df)
    # Count word frequencies over the training set
    n_common_words = 2000
    print('Counting word frequencies...')
    # List of all words that occur in the training set
    all_words_in_train = get_word_list_from_data(train_text_df)
    print(all_words_in_train)
    # Frequency distribution
    fdisk = nltk.FreqDist(all_words_in_train)
    # fdisk.plot(5000)
    # The 2000 most frequent words form the "common word" vocabulary
    common_words_freqs = fdisk.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))
    for word, count in common_words_freqs:
        print('{}: {} occurrences'.format(word, count))
    print()

    # Extract features on the training set
    # The text column is passed as a list
    text_collection = TextCollection(train_text_df['text'].values.tolist())
    # Extract features for the train and test samples:
    # _X holds the tf-idf values of the common words per row, _y the sentiment label
    print('Extracting features from the training samples...', end=' ')
    train_X, train_y = extract_feat_from_data(train_text_df, text_collection,
                                              common_words_freqs)
    print('Done')
    print()
    print('Extracting features from the test samples...', end=' ')
    test_X, test_y = extract_feat_from_data(test_text_df, text_collection,
                                            common_words_freqs)
    print('Done')

    # Gaussian naive Bayes model ----------------------------------------------
    print('Training the model...', end=' ')
    # Gaussian naive Bayes classifier
    gnb = GaussianNB()
    # Fit the model on the training features
    gnb.fit(train_X, train_y)
    # Persist the model and the feature-extraction artifacts
    joblib.dump(gnb, 'NaiveBayes_2000.pkl')
    joblib.dump(text_collection, 'NB_text_collection_2000.pkl')
    joblib.dump(common_words_freqs, 'NB_common_words_2000.pkl')
    print('Done')
    print()

    # Model evaluation ---------------------------------------------------------
    test_pred = gnb.predict(test_X)
    print('Accuracy:', cal_acc(test_y, test_pred))

    # Predict a single review with the trained model
    predict.NB_predict('电池挺好的,但是续航不行')
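# A hypothetical sketch of what predict.NB_predict might do (the predict module
# is not shown). Based on the artifacts saved with joblib above, it presumably
# reloads the model, the TextCollection, and the common-word list, turns the raw
# review into the same tf-idf feature vector, and reports the predicted label.
# The proc_text call and feature layout below are assumptions.
import joblib
import numpy as np

def NB_predict(raw_text):
    '''Predict the sentiment label of a single raw review string.'''
    gnb = joblib.load('NaiveBayes_2000.pkl')
    text_collection = joblib.load('NB_text_collection_2000.pkl')
    common_words_freqs = joblib.load('NB_common_words_2000.pkl')
    tokens = proc_text(raw_text).split()          # clean and tokenize as in training
    feat = np.array([[text_collection.tf_idf(word, tokens)
                      for word, _ in common_words_freqs]])
    label = gnb.predict(feat)[0]
    print('Predicted label:', label)
    return label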
def run_main():
    """Main entry point."""
    # 1. Read, process, and clean the data
    if is_first_run:
        print('Processing and cleaning the text data...', end=' ')
        # On the first run, the raw text data has to be processed and cleaned:
        # read the raw text data and save labels and text to csv
        read_and_save_to_csv()
        # Read the resulting csv file and build the dataset
        text_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                              encoding='utf-8')
        # Process the text column
        text_df['text'] = text_df['text'].apply(proc_text)
        # Filter out empty strings, i.e. drop all empty rows
        text_df = text_df[text_df['text'] != '']
        # Save the processed text data; preprocessing is finished
        text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                       index=None, encoding='utf-8')
        print('Done, results saved.')

    # 2. Split into train and test sets
    print('Loading the processed text data')
    clean_text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                                encoding='utf-8')
    # Split into train and test sets: 80% of each sentiment label goes to training
    train_text_df, test_text_df = split_train_test(clean_text_df)
    # Basic statistics of the train and test sets
    print('Samples per class in the train set:', train_text_df.groupby('label').size())
    print('Samples per class in the test set:', test_text_df.groupby('label').size())

    # 3. Feature extraction
    # Count word frequencies over the training set
    n_common_words = 1000
    # List of all words that occur in the training set
    all_words_in_train = get_word_list_from_data(train_text_df)
    print('Counting word frequencies...')
    print('Total number of words:', len(all_words_in_train))
    # Frequency distribution
    fdisk = nltk.FreqDist(all_words_in_train)
    print('Vocabulary size:', len(fdisk))
    # The 1000 most frequent words form the "common word" vocabulary
    common_words_freqs = fdisk.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))
    for word, count in common_words_freqs:
        print('{}: {} occurrences'.format(word, count))
    print()

    # Extract features on the training set
    # The text column is passed as a list
    text_collection = TextCollection(train_text_df['text'].values.tolist())
    # Extract features for the train and test samples:
    # _X holds the tf-idf values of the common words per row, _y the sentiment label
    print('Extracting features from the training samples...', end=' ')
    if load_np:
        # Reload previously extracted features from disk
        train_X = np.load("train_x.npy")
        print(train_X.shape)
        train_X = train_X.reshape(train_X.shape[0], 1, train_X.shape[1])
        print(train_X.shape)
        train_y = np.load("train_y.npy")
        test_X = np.load("test_X.npy")
        test_X = test_X.reshape(test_X.shape[0], 1, test_X.shape[1])
        test_y = np.load("test_y.npy")
    else:
        train_X, train_y = extract_feat_from_data(train_text_df, text_collection,
                                                  common_words_freqs)
        np.save("train_x.npy", train_X)
        np.save("train_y.npy", train_y)
        print('Done')
        print()
        print('Extracting features from the test samples...', end=' ')
        test_X, test_y = extract_feat_from_data(test_text_df, text_collection,
                                                common_words_freqs)
        np.save("test_X.npy", test_X)
        np.save("test_y.npy", test_y)
        print('Done')

    # 4. Train the model
    print('Training the model...', end=' ')
    # gnb = GaussianNB()  # gave about 0.29 accuracy
    gnb = LogisticRegression(multi_class="ovr")
    model = get_model(n_common_words)
    onehot_train_y = keras.utils.to_categorical(train_y, num_classes=4)
    onehot_test_y = keras.utils.to_categorical(test_y, num_classes=4)
    # model.fit(train_X, onehot_train_y, epochs=50, batch_size=128, verbose=1)
    # score = model.evaluate(test_X, onehot_test_y, batch_size=128)
    # Fit the classifier on the training features
    gnb.fit(train_X, train_y)
    model.save_weights("model.h5")
    print('Done')
    # print('score', score)

    # 5. Prediction
    print('Testing the model...', end=' ')
    # text_pred = model.predict(test_X, 128)
    test_pred = gnb.predict(test_X)
    # test_pred is an ndarray, e.g. array([3., 3., 3., 2., 3., ...])
    print('Done')

    # Print the accuracy
    print('Accuracy:', cal_acc(test_y, test_pred))
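# A hypothetical sketch of the get_model helper referenced above (not shown in
# the snippet). The features are reshaped to (samples, 1, n_common_words) and
# the labels are one-hot encoded to 4 classes, so the model is assumed to be a
# small LSTM over a single timestep with a 4-way softmax output.
from keras.models import Sequential
from keras.layers import LSTM, Dense

def get_model(n_common_words):
    '''Small LSTM classifier over (1, n_common_words) inputs with 4 output classes.'''
    model = Sequential()
    model.add(LSTM(64, input_shape=(1, n_common_words)))
    model.add(Dense(4, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model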
import os

import numpy as np
import pandas as pd
from sklearn import linear_model

import tools

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

# Download and extract the housing dataset
tools.fetch_data(HOUSING_URL, HOUSING_PATH)

DATA_PATH = os.path.join(HOUSING_PATH, 'housing.csv')
housing_data = pd.read_csv(DATA_PATH)
# print(housing_data.head())

# Split into train and test sets (20% test) and write them under HOUSING_PATH
tools.split_train_test(housing_data, 0.2, HOUSING_PATH)

TRAIN_PATH = os.path.join(HOUSING_PATH, 'train_data.csv')
train_data = pd.read_csv(TRAIN_PATH)

corr_matrix = train_data.corr()
print(corr_matrix)

# Prepare the data
X_median_income = np.c_[train_data["median_income"]]
Y_total_rooms = np.c_[train_data["total_rooms"]]
print(train_data.head())

# Visualize the data
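# The script above imports sklearn's linear_model but stops after preparing the
# two columns. A minimal continuation, as a sketch rather than part of the
# original script, would fit a simple linear regression of total_rooms on
# median_income and plot the fit.
import matplotlib.pyplot as plt

model = linear_model.LinearRegression()
model.fit(X_median_income, Y_total_rooms)
print('slope:', model.coef_[0][0], 'intercept:', model.intercept_[0])

plt.scatter(X_median_income, Y_total_rooms, s=2)
plt.plot(X_median_income, model.predict(X_median_income), color='red')
plt.xlabel('median_income')
plt.ylabel('total_rooms')
plt.show()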
def run_main():
    # Load the cleaned data file
    clean_text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                                encoding='utf-8')
    # Split into train and test sets
    train, test = split_train_test(clean_text_df, 0.8)

    train_text_list = []
    train_label_list = []
    test_text_list = []
    test_label_list = []
    print('Building the datasets')
    for i, r_data in train.iterrows():
        train_text_list.append(r_data['text'])
        train_label_list.append(r_data['label'])
    for i, r_data in test.iterrows():
        test_text_list.append(r_data['text'])
        test_label_list.append(r_data['label'])
    print(train_text_list)
    print(type(train_text_list))
    print(train_text_list[0])

    # Shuffle the samples
    c = list(zip(train_text_list, train_label_list))
    np.random.shuffle(c)
    train_text_list[:], train_label_list[:] = zip(*c)
    c = list(zip(test_text_list, test_label_list))
    np.random.shuffle(c)
    test_text_list[:], test_label_list[:] = zip(*c)

    print('Building word vectors')
    # Build a vocabulary from the 10000 most frequent words
    tokenizer = Tokenizer(num_words=10000)
    # Fit on all training texts; the top 10000 words by frequency enter the vocabulary
    tokenizer.fit_on_texts(train_text_list)
    joblib.dump(tokenizer, 'tokenizer.pkl')
    # Convert the train and test texts to integer sequences
    x_train_seq = tokenizer.texts_to_sequences(train_text_list)
    x_test_seq = tokenizer.texts_to_sequences(test_text_list)
    # Pad or truncate to a fixed length
    x_train = sequence.pad_sequences(x_train_seq, maxlen=100)
    x_test = sequence.pad_sequences(x_test_seq, maxlen=100)
    y_train = np.array(train_label_list).reshape(-1, 1)
    y_test = np.array(test_label_list).reshape(-1, 1)
    # print('Integer sequences: {0}'.format(x_train_seq))
    # print('Sequence type: {0}'.format(type(x_train_seq)))
    # print('After padding: {0}'.format(x_train))
    # print('Type: {0}'.format(type(x_train)))
    # print(x_train_seq[0])
    # print(x_train[0])

    print('Building the TextCNN model')
    model = text_cnn()
    batch_size = 64
    epochs = 60

    # Record the start time as a formatted string
    startdate = datetime.datetime.now()
    startdate = startdate.strftime("%Y-%m-%d %H:%M:%S")
    history = model.fit(x_train, y_train,
                        validation_split=0.25,
                        batch_size=batch_size,
                        epochs=epochs,
                        shuffle=True)
    enddate = datetime.datetime.now()
    enddate = enddate.strftime("%Y-%m-%d %H:%M:%S")
    print('Training time', subtime(startdate, enddate))
    model.save('text_cnn_alter.h5')

    # Plot training & validation accuracy
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('Model accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

    # Plot training & validation loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

    # Reload a previously saved model and evaluate it on the test set
    model = load_model('text_cnn_finally.h5')
    scores = model.evaluate(x_test, y_test)
    print('test_loss: %f, accuracy: %f' % (scores[0], scores[1]))
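# A hypothetical sketch of the text_cnn() model builder used above (it is not
# shown in the snippet). Given pad_sequences(maxlen=100), a 10000-word Tokenizer,
# and a single sigmoid label, a typical TextCNN for this setup would look roughly
# like this; the filter widths and embedding dimension are assumptions.
from keras.models import Model
from keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, concatenate, Dense, Dropout

def text_cnn(maxlen=100, vocab_size=10000, embed_dim=200):
    '''Multi-filter-width 1D-CNN text classifier with a sigmoid output.'''
    inp = Input(shape=(maxlen,))
    emb = Embedding(vocab_size, embed_dim, input_length=maxlen)(inp)
    convs = []
    for width in (3, 4, 5):                        # parallel convolutions of different widths
        c = Conv1D(128, width, activation='relu')(emb)
        convs.append(GlobalMaxPooling1D()(c))
    merged = concatenate(convs)
    merged = Dropout(0.5)(merged)
    out = Dense(1, activation='sigmoid')(merged)
    model = Model(inputs=inp, outputs=out)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model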
        return sigmoid(wx)


if __name__ == '__main__':
    from machine_learning.textCategory.category_tfidf import Category

    category = Category()
    category.load()
    # Binarize the labels: 'edu' becomes 1, everything else 0
    labels = []
    for i in category.classifier.labels:
        if i == 'edu':
            labels.append(1)
        else:
            labels.append(0)

    # x, y = load_data()
    lr = LR()
    # s1, s2 = lr.add_b(x, y)
    s1, s2 = lr.add_b(category.classifier.doc_vector, labels)
    x_train, y_train, x_test, y_test = split_train_test(s1, s2)
    lr.train(x_train, y_train)
    d = lr.predict(x_test)

    # Accuracy: threshold the predicted probabilities at 0.5
    s = 0
    for i in range(len(d)):
        t = 1 if d[i] > 0.5 else 0
        if t == y_test[i]:
            s += 1
    print(s / len(y_test))
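# Minimal sketches of the helpers the fragment above relies on (their
# definitions are not shown): a plain logistic sigmoid and a split_train_test
# that returns x_train, y_train, x_test, y_test. The 0.8 train ratio is an
# assumption.
import numpy as np

def sigmoid(z):
    '''Logistic function.'''
    return 1.0 / (1.0 + np.exp(-z))

def split_train_test(x, y, train_ratio=0.8):
    '''Shuffle the samples and return x_train, y_train, x_test, y_test.'''
    x = np.asarray(x)
    y = np.asarray(y)
    indices = np.random.permutation(len(x))
    split_at = int(len(x) * train_ratio)
    train_idx, test_idx = indices[:split_at], indices[split_at:]
    return x[train_idx], y[train_idx], x[test_idx], y[test_idx]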