# Example #1
def sklearn_supervised(language='English',
                       model_exist=False,
                       model_path=None,
                       model_name='SVM',
                       vector=True,
                       hashmodel='CountVectorizer',
                       savemodel=False,
                       train_dataset=None,
                       test_data=None):
    '''
    Train (or load) a classical ML text classifier and predict labels
    for the test data.

    :param language: corpus language; Chinese text is segmented with
        jieba by the feature helpers before vectorizing
    :param model_exist: if True, load a pre-trained model from
        model_path instead of training a new one
    :param model_path: path used to load/save the model
    :param model_name: classifier to train: 'SVM', 'KNN' or 'Logistic'
    :param vector: if True, embed sentences with word2vec
        (sentence_2_vec) and mean-pool the word vectors; otherwise use
        the sparse representation from sentence_2_sparse
    :param hashmodel: sparse vectorizer choice: 'CountVectorizer',
        'TfidfTransformer' or 'HashingVectorizer'
    :param savemodel: if True, persist the trained model to model_path
    :param train_dataset: training set as [[samples], [labels]]
    :param test_data: test samples [data]
    :return: array of predicted labels for test_data
    :raises ValueError: if model_name is not a supported model
    '''
    if vector:
        # Dense path: word2vec embeddings, then average each sentence's
        # word vectors into a single fixed-size feature vector.
        train_data_transform, test_data_transform = sentence_2_vec(
            train_data=train_dataset[0],
            test_data=test_data,
            size=50,
            window=5,
            min_count=1)
        train_data_transform = [sum(i) / len(i) for i in train_data_transform]
        test_data_transform = [sum(i) / len(i) for i in test_data_transform]
    else:
        # Sparse path: bag-of-words style features.
        train_data_transform, test_data_transform = sentence_2_sparse(
            train_data=train_dataset[0],
            test_data=test_data,
            language=language,
            hash=True,
            hashmodel=hashmodel)
    train_label = train_dataset[1]
    if not model_exist:  # no saved model: train one from scratch
        if model_name == 'KNN':
            # Cap k at the number of samples so tiny training sets work.
            model = KNeighborsClassifier(n_neighbors=min(len(train_label), 5))
        elif model_name == 'SVM':
            model = SVC(kernel='linear', C=1.0)  # linear kernel, penalty C=1
        elif model_name == 'Logistic':
            model = LogisticRegression(solver='liblinear', C=1.0)
        else:
            # Previously an unknown name fell through every branch and
            # crashed later with NameError; fail fast and clearly instead.
            raise ValueError("model_name must be 'SVM', 'KNN' or 'Logistic', "
                             "got %r" % (model_name,))
        model.fit(train_data_transform, train_label)
        if savemodel:
            joblib.dump(model, model_path)  # persist for later reuse
    else:  # a trained model already exists: just load it
        model = joblib.load(model_path)
    return model.predict(test_data_transform)
# Example #2
from sentence_transform.sentence_2_vec import sentence_2_vec
from models.neural_Conv1D import neural_Conv1D
from models.keras_log_plot import keras_log_plot

# Load the positively and negatively labelled review sheets from the
# demo workbook and stack them into a single frame.
workbook = 'D:/github/Text-Classification/data/demo_score/data.xlsx'
positive = pd.read_excel(workbook, sheet_name='positive')
negative = pd.read_excel(workbook, sheet_name='negative')
total = pd.concat([positive, negative], axis=0)

# Turn each sentence into a sequence of word vectors.
data_transform = sentence_2_vec(train_data=total.loc[:, 'evaluation'],
                                test_data=None,
                                size=5,
                                window=5,
                                min_count=1)

# Pad/truncate the variable-length sequences to a common length
# (maxlen=None pads everything out to the longest sequence).
data_transform = pad_sequences(data_transform,
                               maxlen=None,
                               dtype='float32',
                               padding='post',
                               value=0)

# One-hot encode the labels.
label_transform = np.array(pd.get_dummies(total.loc[:, 'label']))
print(data_transform.shape)

# Hold out a third of the data for testing.
train_data, test_data, train_label, test_label = train_test_split(
    data_transform, label_transform, test_size=0.33, random_state=42)
model = neural_Conv1D(input_shape=data_transform.shape[-2:],
                      net_conv_num=[32, 64, 128],