Example #1
0
def predict(name, target):
    """Load the persisted model and label encoder for *name*, classify *target*, and print the decoded label.

    Args:
        name: Base name used to locate "<name>_model.sav" and "<name>_label.sav".
        target: Raw input to be converted into a feature sample via to_sample().
    """
    stop_list = helper.read_stopwords()
    sample = to_sample(target, stop_list)
    model = helper.load(name + "_model.sav")
    # The encoder maps the classifier's integer output back to the original label.
    encoder: LabelEncoder = helper.load(name + "_label.sav")
    decoded = encoder.inverse_transform(model.predict([sample]))
    print(decoded)
Example #2
0
def train(name):
    """Train a one-vs-rest LinearSVC text classifier on dataset *name* and persist it.

    Saves the fitted LabelEncoder to "<name>_label.sav" and the fitted pipeline
    to "<name>_model.sav", then prints the training-set accuracy.
    """
    stop_list = helper.read_stopwords()
    samples, labels = read_dataset(name, stop_list)
    encoder = LabelEncoder()
    encoded_labels = encoder.fit_transform(labels)
    helper.dump(encoder, name + "_label.sav")
    # Bag-of-words counts -> TF-IDF weighting -> linear SVM (one classifier per class).
    model = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(LinearSVC())),
    ])
    model.fit(samples, encoded_labels)
    helper.dump(model, name + "_model.sav")
    # NOTE: this scores on the training data itself, so it overestimates accuracy.
    print(model.score(samples, encoded_labels))
Example #3
0
def create_feature(name):
    """Vectorize dataset *name* and persist the vectorizer, feature matrix, and labels.

    Combines two feature branches with a FeatureUnion:
      * "feature": dict-style features (sample column 1) via DictVectorizer
      * "content": text content (sample column 0) via CountVectorizer + TF-IDF
    Dumps the fitted union to "<name>_vec.sav", the transformed matrix to
    "<name>_data.sav", and the labels to "<name>_label.sav", printing the
    elapsed milliseconds for each stage.
    """
    t_start = DateUtils.current_time_millis()
    samples, labels = read_dataset(name)
    t_loaded = DateUtils.current_time_millis()
    print("LOAD COMPLETE:{}ms".format(t_loaded - t_start))

    samples = np.array(samples)
    stop_list = helper.read_stopwords()
    union = FeatureUnion(
        transformer_list=[
            ("feature", Pipeline([
                ('selector', ItemSelector(1)),
                ("dvec", DictVectorizer(sparse=False)),
            ])),
            ("content", Pipeline([
                ('selector', ItemSelector(0)),
                ('cvec', CountVectorizer(token_pattern=r"(?u)\b\w+\b",
                                         min_df=1,
                                         stop_words=stop_list)),
                ('tfidf', TfidfTransformer()),
            ])),
        ],
        # Both branches contribute with equal weight.
        transformer_weights={"feature": 1.0, "content": 1.0})
    matrix = union.fit_transform(samples)
    t_transformed = DateUtils.current_time_millis()
    print("TRANSFORM COMPLETE:{}ms".format(t_transformed - t_loaded))

    helper.dump(union, name + "_vec.sav")
    t_vec_dumped = DateUtils.current_time_millis()
    print("DUMP VECTOR COMPLETE:{}ms".format(t_vec_dumped - t_transformed))

    helper.dump(matrix, name + "_data.sav")
    helper.dump(labels, name + "_label.sav")
    t_done = DateUtils.current_time_millis()
    print("DUMP FEATURE COMPLETE:{}ms".format(t_done - t_vec_dumped))
Example #4
0
def train(name):
    """Train a random-forest classifier over three weighted TF-IDF text fields and persist it.

    Each named column of the sample dict gets its own character-level TF-IDF
    sub-pipeline; the branches are merged by a weighted FeatureUnion and fed to
    a RandomForestClassifier. The fitted LabelEncoder is saved to
    "<name>_label.sav" and the fitted pipeline to "<name>_model.sav"; the
    training-set score is printed.
    """
    stop_list = helper.read_stopwords()
    samples, labels, headers = read_dataset(name, stop_list)
    sample_dict = transform(samples, headers)

    encoder = LabelEncoder()
    encoded_labels = encoder.fit_transform(labels)
    helper.dump(encoder, name + "_label.sav")

    def field_branch(key):
        # Select one named column, then extract character-level TF-IDF features from it.
        return Pipeline([
            ('selector', ItemSelector(key=key)),
            ('tfidf', TfidfVectorizer(analyzer='char_wb', token_pattern=r"(?u)\b\w+\b", min_df=1)),
        ])

    # Pipeline learner: parallel per-field feature extraction, then the classifier.
    field_keys = ('故障现象', '原因分析', '处理意见及结果')
    model = Pipeline([
        ('union', FeatureUnion(
            transformer_list=[(key, field_branch(key)) for key in field_keys],
            # Weight the fields by presumed relevance to the label.
            transformer_weights={
                '故障现象': 2.0,
                '原因分析': 1.5,
                '处理意见及结果': 1.0,
            },
        )),
        ('RFC', RandomForestClassifier()),
    ])
    model.fit(sample_dict, encoded_labels)
    helper.dump(model, name + "_model.sav")
    # Scored on the training data itself — 1.0 here mostly reflects the tiny sample size.
    print(model.score(sample_dict, encoded_labels))
Exemple #5
0
"""保存一般模型"""
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline, FeatureUnion

from piz_ml.skl import helper
from piz_ml.skl.Test04 import read_dataset, ItemSelector

if __name__ == "__main__":
    (feature_list, label_list) = read_dataset("sample04_test")
    feature_list = np.array(feature_list)
    stop_words = helper.read_stopwords()
    # feature_list = feature_list[:, 0]
    #
    union = FeatureUnion(
        transformer_list=[
            ("feature",
             Pipeline([('selector', ItemSelector(1)),
                       ("vec", DictVectorizer(sparse=False))])),
            (
                "content",
                Pipeline([
                    ('selector', ItemSelector(0)),
                    (
                        'cvec',
                        CountVectorizer(
                            # analyzer='char_wb',
                            token_pattern=r"(?u)\b\w+\b",
                            min_df=1,
                            stop_words=stop_words)),