Example #1
def using_iris():
    '''
    Shows how load_iris works
    '''
    # 1. Load the Iris dataset:
    features, targets, classes = load_iris()

    [n, f_dim] = features.shape

    print(f'* The dataset contains {n} samples')
    print(f'* Each sample has {f_dim} features')

    # 2. Get the first datapoint
    first_feature_set = features[0, :]
    first_target = targets[0]

    print(f'* The first sample has the features:\n{first_feature_set} '+\
        f'and belongs to the class {first_target}')

    print('* Each datapoint can belong to any of the following classes:'+\
        f'\n{classes}')

    # 3. Split into train and test sets
    (train_features, train_targets), (test_features, test_targets) =\
        split_train_test(features, targets, train_ratio=0.9)

    train_n, test_n = train_features.shape[0], test_features.shape[0]
    print(f'* The train set contains {train_n} samples')
    print(f'* The test set contains {test_n} samples')
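The `split_train_test` helper called above is not shown in this example. A minimal sketch of what it might look like, assuming it shuffles the samples and splits features and targets by `train_ratio` (the function name and behavior are assumptions, not the example's actual code):

import numpy as np

def split_train_test(features, targets, train_ratio=0.8):
    """Hypothetical helper: shuffle the samples and split them into train and test sets."""
    n = features.shape[0]
    indices = np.random.permutation(n)      # random order of sample indices
    n_train = int(n * train_ratio)          # number of training samples
    train_idx, test_idx = indices[:n_train], indices[n_train:]
    return ((features[train_idx], targets[train_idx]),
            (features[test_idx], targets[test_idx]))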
Example #2
    def __init__(
        self,
        features: np.ndarray,
        targets: np.ndarray,
        classes: list = [0, 1, 2],
        train_ratio: float = 0.8
    ):
        '''
        train_ratio: The ratio of the Iris dataset that will
        be dedicated to training.
        '''
        (self.train_features, self.train_targets),\
            (self.test_features, self.test_targets) =\
            split_train_test(features, targets, train_ratio)

        self.classes = classes
        self.tree = DecisionTreeClassifier()
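The snippet shows only the constructor, not the enclosing class. A hedged usage sketch, assuming the class is named, say, `IrisTreeClassifier` (a hypothetical name) and that the `load_iris` helper from Example #1 returns `(features, targets, classes)`:

# Hypothetical: class name and load_iris helper are assumptions from Example #1
features, targets, classes = load_iris()
clf = IrisTreeClassifier(features, targets, classes=list(classes), train_ratio=0.8)
clf.tree.fit(clf.train_features, clf.train_targets)
print(clf.tree.score(clf.test_features, clf.test_targets))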
Example #3
def main():
    """
        主函数
    """
    # Step 1: 处理数据集
    print('===Step1: 处理数据集===')

    if not os.path.exists(constant.cln_text_csv_file):
        print('Cleaning data...')
        # Read the raw csv file
        raw_text_df = pd.read_csv(constant.raw_text_csv_file)

        # Clean the raw data
        cln_text_df = clean_text(raw_text_df)

        # Save the cleaned text data
        cln_text_df.to_csv(constant.cln_text_csv_file, index=None)
        print('Done. Results saved to', constant.cln_text_csv_file)

    print('================\n')

    # Step 2. Inspect the cleaned dataset and pick the data used for training
    print('===Step 2. Inspect the dataset===')
    text_data = pd.read_csv(constant.cln_text_csv_file)
    text_data['date'] = pd.to_datetime(text_data['date'])
    text_data.set_index('date', inplace=True)
    print('Number of samples per class:')
    print(text_data.groupby('label').size())

    # Step 3. Split into train and test sets
    print('===Step 3. Split train and test sets===')
    train_text_df, test_text_df = split_train_test(text_data)
    # Check basic info about the train and test sets
    print('Number of samples per class in the train set:')
    print(train_text_df.groupby('label').size())
    print('Number of samples per class in the test set:')
    print(test_text_df.groupby('label').size())
    print('================\n')

    # Step 4. Feature extraction
    print('===Step 4. Text feature extraction===')
    # Count word frequencies
    n_common_words = 200

    # Collect the words from the training set and count their frequencies
    print('Counting word frequencies...')
    all_words_in_train = get_word_list_from_data(train_text_df)
    fdisk = nltk.FreqDist(all_words_in_train)
    common_words_freqs = fdisk.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))
    for word, count in common_words_freqs:
        print('{}: {} times'.format(word, count))
    print()

    # Extract features on the training set
    text_collection = TextCollection(train_text_df['text'].values.tolist())
    print('Extracting features from the training samples...')
    train_X, train_y = extract_feat_from_data(train_text_df, text_collection,
                                              common_words_freqs)
    print('Done')
    print()

    print('Extracting features from the test samples...')
    test_X, test_y = extract_feat_from_data(test_text_df, text_collection,
                                            common_words_freqs)
    print('Done')
    print('================\n')

    # Feature processing
    # Scale the feature ranges
    scaler = StandardScaler()
    tr_feat_scaled = scaler.fit_transform(train_X)
    te_feat_scaled = scaler.transform(test_X)

    # Feature selection
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    tr_feat_scaled_sel = sel.fit_transform(tr_feat_scaled)
    te_feat_scaled_sel = sel.transform(te_feat_scaled)

    # Dimensionality reduction with PCA
    pca = PCA(n_components=0.95)  # keep components explaining 95% of the variance
    tr_feat_scaled_sel_pca = pca.fit_transform(tr_feat_scaled_sel)
    te_feat_scaled_sel_pca = pca.transform(te_feat_scaled_sel)
    print('Feature processing finished')
    print('Feature dimension per sample after processing:', tr_feat_scaled_sel_pca.shape[1])

    # Step 5. Train the models
    models = []
    print('===Step 5. Train models===')
    print('1. Naive Bayes:')
    gnb_model = GaussianNB()
    gnb_model.fit(tr_feat_scaled_sel_pca, train_y)
    models.append(['Naive Bayes', gnb_model])
    print('Done')
    print()

    print('2. Logistic regression:')
    lr_param_grid = [{'C': [1e-3, 1e-2, 1e-1, 1, 10, 100]}]
    lr_model = LogisticRegression()
    best_lr_model = get_best_model(lr_model,
                                   tr_feat_scaled_sel_pca,
                                   train_y,
                                   lr_param_grid,
                                   cv=3)
    models.append(['Logistic regression', best_lr_model])
    print('Done')
    print()

    print('3. Support vector machine:')
    svm_param_grid = [
        {
            'C': [1e-2, 1e-1, 1, 10, 100],
            'gamma': [0.001, 0.0001],
            'kernel': ['rbf']
        },
    ]
    svm_model = svm.SVC(probability=True)
    best_svm_model = get_best_model(svm_model,
                                    tr_feat_scaled_sel_pca,
                                    train_y,
                                    svm_param_grid,
                                    cv=3)
    models.append(['SVM', best_svm_model])
    print('Done')
    print()

    print('4. Random forest:')
    rf_param_grid = [{'n_estimators': [10, 50, 100, 150, 200]}]

    rf_model = RandomForestClassifier()
    best_rf_model = get_best_model(rf_model,
                                   tr_feat_scaled_sel_pca,
                                   train_y,
                                   rf_param_grid,
                                   cv=3)
    rf_model.fit(tr_feat_scaled_sel_pca, train_y)
    models.append(['Random forest', best_rf_model])
    print('Done')
    print()

    # Step 6. Evaluate the models
    print('===Step 6. Evaluate models===')
    for i, model in enumerate(models):
        print('{}-{}'.format(i + 1, model[0]))
        # Print accuracy
        print('Accuracy:',
              accuracy_score(test_y, model[1].predict(te_feat_scaled_sel_pca)))
        # Use the positive-class probability for the AUC score
        print('AUC:',
              roc_auc_score(test_y,
                            model[1].predict_proba(te_feat_scaled_sel_pca)[:, 1]))
        # Print the confusion matrix
        print('Confusion matrix:')
        print(
            confusion_matrix(test_y, model[1].predict(te_feat_scaled_sel_pca)))
        print()
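The `get_best_model` helper used for the logistic regression, SVM, and random forest models is not part of the snippet. A plausible sketch, assuming it wraps scikit-learn's GridSearchCV and returns the refit best estimator (name and behavior are assumptions):

from sklearn.model_selection import GridSearchCV

def get_best_model(model, X, y, param_grid, cv=3):
    """Hypothetical helper: grid-search the parameter grid and return the best refit estimator."""
    grid = GridSearchCV(model, param_grid, cv=cv)
    grid.fit(X, y)
    print('Best parameters:', grid.best_params_)
    return grid.best_estimator_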
Example #4
def run_main():
    # Load the cleaned data file
    clean_text_df = pd.read_csv(os.path.join(dataset_path,
                                             output_cln_text_filename),
                                encoding='utf-8')
    # Split into train and test sets
    train, test = split_train_test(clean_text_df, 0.8)
    train_text_list = []
    train_label_list = []
    test_text_list = []
    test_label_list = []
    print('Building the datasets')
    for i, r_data in train.iterrows():
        train_text_list.append(r_data['text'])
        train_label_list.append(r_data['label'])
    for i, r_data in test.iterrows():
        test_text_list.append(r_data['text'])
        test_label_list.append(r_data['label'])

    # Shuffle the order
    c = list(zip(train_text_list, train_label_list))
    np.random.shuffle(c)
    train_text_list[:], train_label_list[:] = zip(*c)
    c = list(zip(test_text_list, test_label_list))
    np.random.shuffle(c)
    test_text_list[:], test_label_list[:] = zip(*c)

    print('Building word vectors')
    # Build a dictionary of 4000 words
    tokenizer = Tokenizer(num_words=4000)
    # Fit on all training texts; the 4000 most frequent words enter the dictionary
    tokenizer.fit_on_texts(train_text_list)
    # Convert the train and test texts to integer sequences
    x_train_seq = tokenizer.texts_to_sequences(train_text_list)
    x_test_seq = tokenizer.texts_to_sequences(test_text_list)
    # Pad or truncate to a fixed length
    x_train = sequence.pad_sequences(x_train_seq, maxlen=100)
    x_test = sequence.pad_sequences(x_test_seq, maxlen=100)

    # Convert the label lists to arrays for Keras
    y_train = np.array(train_label_list)
    y_test = np.array(test_label_list)

    print('Building the LSTM model')
    model = Sequential()
    model.add(Embedding(4000, 200))
    model.add(
        LSTM(64,
             dropout=0.5,
             recurrent_dropout=0.5,
             kernel_regularizer=regularizers.l2(0.1)))
    model.add(Dense(1, activation='sigmoid'))
    batch_size = 64
    epochs = 50
    adam = optimizers.Adam(lr=0.0001)

    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])

    # Record the start time
    startdate = datetime.datetime.now()
    # Convert the current time to a formatted string
    startdate = startdate.strftime("%Y-%m-%d %H:%M:%S")

    history = model.fit(x_train,
                        y_train,
                        validation_split=0.1,
                        batch_size=batch_size,
                        epochs=epochs,
                        shuffle=True)

    enddate = datetime.datetime.now()
    enddate = enddate.strftime("%Y-%m-%d %H:%M:%S")
    print('LSTM training time:', subtime(startdate, enddate))

    model.save('lstm_finally.h5')

    # Plot training & validation accuracy
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('Model accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

    # Plot training & validation loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

    scores = model.evaluate(x_test, y_test)
    print('test_loss: %f, accuracy: %f' % (scores[0], scores[1]))
    plot_model(model, to_file='model.png')
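Here `split_train_test` takes a DataFrame and a ratio, and later examples note that the split keeps 80% of each sentiment label for training. A hedged sketch of such a per-label stratified split (the exact implementation is not shown, so this is an assumption):

import pandas as pd

def split_train_test(text_df, train_ratio=0.8):
    """Hypothetical helper: keep train_ratio of each label's rows for training, the rest for testing."""
    train_parts, test_parts = [], []
    for label, group in text_df.groupby('label'):
        group = group.sample(frac=1)             # shuffle rows within the label
        n_train = int(len(group) * train_ratio)
        train_parts.append(group.iloc[:n_train])
        test_parts.append(group.iloc[n_train:])
    return pd.concat(train_parts), pd.concat(test_parts)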
Example #5
def run_main():
    """
        主函数
    """
    # 1. Read, process, clean, and prepare the data
    if is_first_run:
        print('Processing and cleaning the text data...', end=' ')
        # On the first run, the raw text data needs to be processed and cleaned

        # Read the raw text data and save the labels and text to a csv
        read_and_save_to_csv()

        # Read the processed csv file and build the dataset
        text_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                              encoding='utf-8')

        # Process the text data
        text_df['text'] = text_df['text'].apply(proc_text)

        # Filter out empty strings, dropping all empty rows
        text_df = text_df[text_df['text'] != '']

        # Save the processed text data; text preprocessing is done
        text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                       index=None,
                       encoding='utf-8')
        print('Done. Results saved.')

    # 2. Split into train and test sets
    print('Loading the processed text data')
    clean_text_df = pd.read_csv(os.path.join(dataset_path,
                                             output_cln_text_filename),
                                encoding='utf-8')
    # Split into train and test sets
    # Split off 80% of each sentiment value
    train_text_df, test_text_df = split_train_test(clean_text_df)
    # Check basic info about the train and test sets
    print('Number of samples per class in the train set:', train_text_df.groupby('label').size())
    print('Number of samples per class in the test set:', test_text_df.groupby('label').size())

    # 3. Feature extraction
    # Count word frequencies
    n_common_words = 200

    # Collect the words from the training set and count their frequencies
    print('Counting word frequencies...')

    # Get the list of all words in the training set
    all_words_in_train = get_word_list_from_data(train_text_df)
    # Count word frequencies
    fdisk = nltk.FreqDist(all_words_in_train)

    # Take the 200 most frequent words and their counts
    # to build the "common words" list
    common_words_freqs = fdisk.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))

    for word, count in common_words_freqs:
        print('{}: {} times'.format(word, count))
    print()

    # Extract features on the training set
    # Pass the text column as a list
    text_collection = TextCollection(train_text_df['text'].values.tolist())

    # Extract features for the training and test samples
    # _X holds the tf-idf value of each common word per row, _y holds the sentiment value
    print('Extracting features from the training samples...', end=' ')
    train_X, train_y = extract_feat_from_data(train_text_df, text_collection,
                                              common_words_freqs)
    print('Done')
    print()

    print('Extracting features from the test samples...', end=' ')
    test_X, test_y = extract_feat_from_data(test_text_df, text_collection,
                                            common_words_freqs)
    print('Done')

    # 4. Train a Naive Bayes model
    print('Training the model...', end=' ')
    # Create a Gaussian Naive Bayes model
    gnb = GaussianNB()
    # Fit the model on the training features
    gnb.fit(train_X, train_y)
    print('Done')
    print()

    # 5. Prediction
    print('Testing the model...', end=' ')
    # Predict on the test features
    test_pred = gnb.predict(test_X)
    # test_pred : ndarray : array([3., 3., 3., 2., 3., 3., 3., 0., 3., 3., 3., 2., 1. .....])

    print('Done')

    # Print the accuracy
    print('Accuracy:', cal_acc(test_y, test_pred))
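The comments describe `extract_feat_from_data` as producing one tf-idf value per common word and row, plus the sentiment label. A rough sketch of such a helper, assuming it relies on nltk's `TextCollection.tf_idf` (the actual implementation is not shown):

import numpy as np

def extract_feat_from_data(text_df, text_collection, common_words_freqs):
    """Hypothetical helper: build a tf-idf feature vector over the common words for each row."""
    common_words = [word for word, _ in common_words_freqs]
    X = np.zeros((len(text_df), len(common_words)))
    y = np.zeros(len(text_df))
    for i, (_, row) in enumerate(text_df.iterrows()):
        for j, word in enumerate(common_words):
            X[i, j] = text_collection.tf_idf(word, row['text'])
        y[i] = row['label']
    return X, y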
Example #6
def run_main():
    """
        主函数
    """
    # 1. Read, process, clean, and prepare the data
    if is_first_run:
        print('Processing and cleaning the text data...', end=' ')
        # On the first run, the raw text data needs to be processed and cleaned

        # Read the raw text data and save the labels and text to a csv
        read_and_save_to_csv()

        # Read the processed csv file and build the dataset
        text_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                              encoding='utf-8')

        # Process the text data
        text_df['text'] = text_df['text'].apply(proc_text)

        # Filter out empty strings
        text_df = text_df[text_df['text'] != '']

        # Save the processed text data
        text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                       index=None, encoding='utf-8')
        print('Done. Results saved.')

    # 2. Split into train and test sets
    print('Loading the processed text data')
    clean_text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                                encoding='utf-8')
    # Split into train and test sets
    train_text_df, test_text_df = split_train_test(clean_text_df)
    # Check basic info about the train and test sets
    print('Number of samples per class in the train set:', train_text_df.groupby('label').size())
    print('Number of samples per class in the test set:', test_text_df.groupby('label').size())

    # 3. Feature extraction
    # Count word frequencies
    n_common_words = 200

    # Collect the words from the training set and count their frequencies
    print('Counting word frequencies...')
    all_words_in_train = get_word_list_from_data(train_text_df)
    fdisk = nltk.FreqDist(all_words_in_train)
    common_words_freqs = fdisk.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))
    for word, count in common_words_freqs:
        print('{}: {} times'.format(word, count))
    print()

    # Extract features on the training set
    text_collection = TextCollection(train_text_df['text'].values.tolist())
    print('Extracting features from the training samples...', end=' ')
    train_X, train_y = extract_feat_from_data(train_text_df, text_collection, common_words_freqs)
    print('Done')
    print()

    print('Extracting features from the test samples...', end=' ')
    test_X, test_y = extract_feat_from_data(test_text_df, text_collection, common_words_freqs)
    print('Done')

    # 4. Train a Naive Bayes model
    print('Training the model...', end=' ')
    gnb = GaussianNB()
    gnb.fit(train_X, train_y)
    print('Done')
    print()

    # 5. Prediction
    print('Testing the model...', end=' ')
    test_pred = gnb.predict(test_X)
    print('Done')

    # Print the accuracy
    print('Accuracy:', cal_acc(test_y, test_pred))
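`proc_text`, applied to the raw text in these examples, is not shown. For Chinese reviews a typical implementation segments the text and drops stopwords; a hedged sketch, assuming jieba is used and with a placeholder stopword list (both assumptions):

import re
import jieba

# Hypothetical stopword set; the real project presumably loads one from a file
stopwords = set(['的', '了', '是'])

def proc_text(raw_line):
    """Hypothetical helper: keep Chinese characters, segment with jieba, drop stopwords."""
    chinese_only = ''.join(re.findall(r'[\u4e00-\u9fa5]+', str(raw_line)))
    words = jieba.cut(chinese_only)
    filtered = [w for w in words if w not in stopwords]
    return ' '.join(filtered)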
Example #7
def run_main():
    # Delete rows
    '''
    text_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                      encoding='utf-8')
    df = text_df.drop(labels=range(106728,159814),axis=0)
    df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                   index=None, encoding='utf-8')
    '''
    # Relabel classes
    '''
    text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                         encoding='utf-8')
    text_df['label'].replace(4,0,inplace=True)
    text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                   index=None, encoding='utf-8')


    # Print the data before stopword removal
    text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                          encoding='utf-8')
    print(text_df.head(5))
    '''
    '''
    # Remove stopwords
    text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                          encoding='utf-8')
    text_df['text'] = text_df['text'].apply(proc_text)

    # Filter out empty strings, dropping all empty rows
    text_df = text_df[text_df['text'] != '']

    # Save the processed text data; text preprocessing is done
    text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                   index=None, encoding='utf-8')
    print(text_df.head(5))
    print('Done. Results saved.')
    '''

    # Train/test split ------------------------------------------------------------
    print('Loading the processed text data')
    clean_text_df = pd.read_csv(os.path.join(dataset_path,
                                             output_cln_text_filename),
                                encoding='utf-8')
    # Split into train and test sets
    # Split off 80% of each sentiment value
    train_text_df, test_text_df = split_train_test(clean_text_df)
    # Check basic info about the train and test sets
    print('Number of samples per class in the train set:', train_text_df.groupby('label').size())
    print('Number of samples per class in the test set:', test_text_df.groupby('label').size())
    # -------------------------------------------------------------------------------

    # Build the bag-of-words model ----------------------------------------------------------------------
    clean_text_df = pd.read_csv(os.path.join(dataset_path,
                                             output_cln_text_filename),
                                encoding='utf-8')
    train_text_df, test_text_df = split_train_test(clean_text_df)
    # Count word frequencies
    n_common_words = 2000
    # Collect the words from the training set and count their frequencies
    print('Counting word frequencies...')
    # Get the list of all words in the training set
    all_words_in_train = get_word_list_from_data(train_text_df)
    print(all_words_in_train)
    # Count word frequencies
    fdisk = nltk.FreqDist(all_words_in_train)
    # fdisk.plot(5000)
    # Take the 2000 most frequent words and their counts
    # to build the "common words" list
    common_words_freqs = fdisk.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))
    for word, count in common_words_freqs:
        print('{}: {} times'.format(word, count))
    print()
    # Extract features on the training set
    # Pass the text column as a list
    text_collection = TextCollection(train_text_df['text'].values.tolist())
    # Extract features for the training and test samples
    # _X holds the tf-idf value of each common word per row, _y holds the sentiment value
    print('Extracting features from the training samples...', end=' ')
    train_X, train_y = extract_feat_from_data(train_text_df, text_collection,
                                              common_words_freqs)
    print('Done')
    print()

    print('Extracting features from the test samples...', end=' ')
    test_X, test_y = extract_feat_from_data(test_text_df, text_collection,
                                            common_words_freqs)
    print('Done')
    # -------------------------------------------------------------------------------

    # Gaussian Naive Bayes model ---------------------------------------------------------------------------------
    print('Training the model...', end=' ')
    # Create a Gaussian Naive Bayes model
    gnb = GaussianNB()

    # Fit the model on the training features
    gnb.fit(train_X, train_y)
    # Save the model and related artifacts
    joblib.dump(gnb, 'NaiveBayes_2000.pkl')
    joblib.dump(text_collection, 'NB_text_collection_2000.pkl')
    joblib.dump(common_words_freqs, 'NB_common_words_2000.pkl')

    print('Done')
    print()
    # ---------------------------------------------------------------------------------

    # Model evaluation ---------------------------------------------------------------------------------
    test_pred = gnb.predict(test_X)
    print('Accuracy:', cal_acc(test_y, test_pred))
    # ---------------------------------------------------------------------------------
    predict.NB_predict('电池挺好的,但是续航不行')
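`predict.NB_predict` is called on a new review at the end but is not shown. A hedged sketch of how it might reuse the artifacts saved above, assuming the same `proc_text` cleaning and the same per-word tf-idf featurization as during training (all assumptions):

import joblib  # assumed to be the same joblib used for the dumps above

def NB_predict(sentence):
    """Hypothetical helper: load the saved model and artifacts, featurize one sentence, predict its label."""
    gnb = joblib.load('NaiveBayes_2000.pkl')
    text_collection = joblib.load('NB_text_collection_2000.pkl')
    common_words_freqs = joblib.load('NB_common_words_2000.pkl')

    text = proc_text(sentence)  # assumed: same cleaning/segmentation as training
    feat = [text_collection.tf_idf(word, text) for word, _ in common_words_freqs]
    label = gnb.predict([feat])[0]
    print('Predicted label:', label)
    return label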
Example #8
def run_main():
    """
        主函数
    """
    # 1. Read, process, clean, and prepare the data
    if is_first_run:
        print('Processing and cleaning the text data...', end=' ')
        # On the first run, the raw text data needs to be processed and cleaned

        # Read the raw text data and save the labels and text to a csv
        read_and_save_to_csv()

        # Read the processed csv file and build the dataset
        text_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                              encoding='utf-8')

        # Process the text data
        text_df['text'] = text_df['text'].apply(proc_text)

        # Filter out empty strings, dropping all empty rows
        text_df = text_df[text_df['text'] != '']

        # Save the processed text data; text preprocessing is done
        text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                       index=None, encoding='utf-8')
        print('Done. Results saved.')



    # 2. Split into train and test sets
    print('Loading the processed text data')
    clean_text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                                encoding='utf-8')
    # Split into train and test sets
    # Split off 80% of each sentiment value
    train_text_df, test_text_df = split_train_test(clean_text_df)
    # Check basic info about the train and test sets
    print('Number of samples per class in the train set:', train_text_df.groupby('label').size())
    print('Number of samples per class in the test set:', test_text_df.groupby('label').size())


    # 3. Feature extraction
    # Count word frequencies
    n_common_words = 1000

    # Collect the words from the training set and count their frequencies
    #def count_tf():

    # Get the list of all words in the training set

    all_words_in_train = get_word_list_from_data(train_text_df)
    print('Counting word frequencies...')
    print('Total number of words:', len(all_words_in_train))

    # Count word frequencies
    fdisk = nltk.FreqDist(all_words_in_train)
    print('Number of distinct words:', len(fdisk))
    # Take the 1000 most frequent words and their counts
    # to build the "common words" list
    common_words_freqs = fdisk.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))

    for word, count in common_words_freqs:
        print('{}: {} times'.format(word, count))
    print()

    # Extract features on the training set
    # Pass the text column as a list
    text_collection = TextCollection(train_text_df['text'].values.tolist())

    # Extract features for the training and test samples
    # _X holds the tf-idf value of each common word per row, _y holds the sentiment value
    print('Extracting features from the training samples...', end=' ')
    if load_np:
        train_X = np.load("train_x.npy")
        print(train_X.shape)
        train_X = train_X.reshape(train_X.shape[0], 1, train_X.shape[1])
        print(train_X.shape)
        train_y = np.load("train_y.npy")
        test_X = np.load("test_X.npy")
        test_X = test_X.reshape(test_X.shape[0], 1, test_X.shape[1])
        test_y = np.load("test_y.npy")
    else:
        train_X, train_y = extract_feat_from_data(train_text_df, text_collection, common_words_freqs)
        np.save("train_x.npy", train_X)
        np.save("train_y.npy", train_y)
        print('Done')
        print()

        print('Extracting features from the test samples...', end=' ')
        test_X, test_y = extract_feat_from_data(test_text_df, text_collection, common_words_freqs)
        np.save("test_X.npy", test_X)
        np.save("test_y.npy", test_y)
        print('Done')

    # 4. Train the model (logistic regression; Naive Bayes is commented out)
    print('Training the model...', end=' ')
    # Create the classifier
    # gnb = GaussianNB()  # accuracy 0.29
    gnb = LogisticRegression(multi_class="ovr")
    model = get_model(n_common_words)
    onehot_train_y = keras.utils.to_categorical(train_y, num_classes=4)
    onehot_test_y = keras.utils.to_categorical(test_y, num_classes=4)

    # model.fit(train_X, onehot_train_y, epochs=50, batch_size=128, verbose=1)
    # score = model.evaluate(test_X, onehot_test_y, batch_size=128)
    # Fit the classifier on the training features
    gnb.fit(train_X, train_y)
    model.save_weights("model.h5")
    print('Done')
    # print('score', score)

    # 5. Prediction
    print('Testing the model...', end=' ')
    # Predict on the test features
    # text_pred = model.predict(test_X, 128)
    test_pred = gnb.predict(test_X)
    # test_pred : ndarray : array([3., 3., 3., 2., 3., 3., 3., 0., 3., 3., 3., 2., 1. .....])

    print('Done')

    # Print the accuracy
    print('Accuracy:', cal_acc(test_y, test_pred))
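`cal_acc` is another helper these text examples rely on; a minimal sketch, assuming it simply measures the fraction of correct predictions (the real implementation is not shown):

import numpy as np

def cal_acc(true_labels, pred_labels):
    """Hypothetical helper: fraction of predictions that match the true labels."""
    true_labels = np.asarray(true_labels)
    pred_labels = np.asarray(pred_labels)
    return float(np.mean(true_labels == pred_labels))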
Example #9
import numpy as np
import pandas as pd
from sklearn import linear_model
import tools
import os

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

tools.fetch_data(HOUSING_URL, HOUSING_PATH)

DATA_PATH = os.path.join(HOUSING_PATH, 'housing.csv')
housing_data = pd.read_csv(DATA_PATH)
# print(housing_data.head())

tools.split_train_test(housing_data, 0.2, HOUSING_PATH)

TRAIN_PATH = os.path.join(HOUSING_PATH, 'train_data.csv')

train_data = pd.read_csv(TRAIN_PATH)

corr_matrix = train_data.corr()
print(corr_matrix)

# Prepare the data
X_median_income = np.c_[train_data["median_income"]]
Y_total_rooms = np.c_[train_data["total_rooms"]]

print(train_data.head())

# Visualize the data
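`tools.fetch_data` and `tools.split_train_test` come from a local `tools` module that is not shown; the latter presumably writes `train_data.csv` into `HOUSING_PATH`. A hedged sketch of `fetch_data`, assuming it downloads and extracts the housing.tgz archive much like the handson-ml `fetch_housing_data` helper:

import os
import tarfile
import urllib.request

def fetch_data(data_url, data_path):
    """Hypothetical helper: download a .tgz archive and extract it into data_path."""
    os.makedirs(data_path, exist_ok=True)
    tgz_path = os.path.join(data_path, 'housing.tgz')
    urllib.request.urlretrieve(data_url, tgz_path)   # download the archive
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=data_path)       # extract housing.csv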
Example #10
def run_main():
    # Load the cleaned data file
    clean_text_df = pd.read_csv(os.path.join(dataset_path,
                                             output_cln_text_filename),
                                encoding='utf-8')
    # Split into train and test sets
    train, test = split_train_test(clean_text_df, 0.8)
    train_text_list = []
    train_label_list = []
    test_text_list = []
    test_label_list = []
    print('Building the datasets')
    for i, r_data in train.iterrows():
        train_text_list.append(r_data['text'])
        train_label_list.append(r_data['label'])
    for i, r_data in test.iterrows():
        test_text_list.append(r_data['text'])
        test_label_list.append(r_data['label'])

    print(train_text_list)
    print(type(train_text_list))
    print(train_text_list[0])

    # Shuffle the order
    c = list(zip(train_text_list, train_label_list))
    np.random.shuffle(c)
    train_text_list[:], train_label_list[:] = zip(*c)
    c = list(zip(test_text_list, test_label_list))
    np.random.shuffle(c)
    test_text_list[:], test_label_list[:] = zip(*c)

    print('Building word vectors')
    # Build a dictionary of 10000 words
    tokenizer = Tokenizer(num_words=10000)
    # Fit on all training texts; the 10000 most frequent words enter the dictionary
    tokenizer.fit_on_texts(train_text_list)
    joblib.dump(tokenizer, 'tokenizer.pkl')
    #
    # Convert the train and test texts to integer sequences
    x_train_seq = tokenizer.texts_to_sequences(train_text_list)
    x_test_seq = tokenizer.texts_to_sequences(test_text_list)
    # Pad or truncate to a fixed length
    x_train = sequence.pad_sequences(x_train_seq, maxlen=100)
    x_test = sequence.pad_sequences(x_test_seq, maxlen=100)
    y_train = np.array(train_label_list).reshape(-1, 1)
    y_test = np.array(test_label_list).reshape(-1, 1)
    # print('Integer sequences: {0}'.format(x_train_seq))
    # print('Type of the integer sequences: {0}'.format(type(x_train_seq)))
    # print('After padding: {0}'.format(x_train))
    # print('Type: {0}'.format(type(x_train)))
    # print(x_train_seq[0])
    # print(x_train[0])

    print('Building the TEXT-CNN model')
    model = text_cnn()
    batch_size = 64
    epochs = 60
    # Record the start time
    startdate = datetime.datetime.now()
    # Convert the current time to a formatted string
    startdate = startdate.strftime("%Y-%m-%d %H:%M:%S")
    history = model.fit(x_train,
                        y_train,
                        validation_split=0.25,
                        batch_size=batch_size,
                        epochs=epochs,
                        shuffle=True)
    enddate = datetime.datetime.now()
    enddate = enddate.strftime("%Y-%m-%d %H:%M:%S")
    print('Training time:', subtime(startdate, enddate))
    model.save('text_cnn_alter.h5')
    # Plot training & validation accuracy
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('Model accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

    # Plot training & validation loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

    model = load_model('text_cnn_finally.h5')
    scores = model.evaluate(x_test, y_test)
    print('test_loss: %f, accuracy: %f' % (scores[0], scores[1]))
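`subtime`, used here and in Example #4 to report the training duration, is not shown. A minimal sketch, assuming it subtracts two "%Y-%m-%d %H:%M:%S" timestamp strings (name and signature are taken from the calls above):

import datetime

def subtime(start_str, end_str):
    """Hypothetical helper: elapsed time between two formatted timestamp strings."""
    fmt = "%Y-%m-%d %H:%M:%S"
    start = datetime.datetime.strptime(start_str, fmt)
    end = datetime.datetime.strptime(end_str, fmt)
    return end - start   # a datetime.timedelta, printed as HH:MM:SS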
Example #11
File: lr.py  Project: bbz6810/myNlp
        return sigmoid(wx)


if __name__ == '__main__':
    from machine_learning.textCategory.category_tfidf import Category

    category = Category()
    category.load()
    labels = []
    for i in category.classifier.labels:
        if i == 'edu':
            labels.append(1)
        else:
            labels.append(0)
    # x, y = load_data()
    lr = LR()
    # s1, s2 = lr.add_b(x, y)
    s1, s2 = lr.add_b(category.classifier.doc_vector, labels)

    x_train, y_train, x_test, y_test = split_train_test(s1, s2)

    lr.train(x_train, y_train)

    d = lr.predict(x_test)
    s = 0
    for i in range(len(d)):
        t = 1 if d[i] > 0.5 else 0
        if t == y_test[i]:
            s += 1
    print(s / len(y_test))
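The final loop thresholds the sigmoid outputs at 0.5 and counts matches by hand; the same accuracy can be computed more compactly, for example with numpy (a sketch, assuming `d` and `y_test` are array-like of equal length):

import numpy as np

# Threshold the predicted probabilities at 0.5 and compare with the true labels
preds = (np.asarray(d) > 0.5).astype(int)
accuracy = np.mean(preds == np.asarray(y_test))
print(accuracy)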