Example No. 1
from common import *  # helper module with load_sentiment, store_pkl, store_csv (cf. Example No. 10)

import numpy
import pandas
from pandas import DataFrame
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn2pmml.pipeline import PMMLPipeline

sentiment_X, sentiment_y = load_sentiment("Sentiment")

def build_sentiment(classifier, name, with_proba=True, **pmml_options):
    pipeline = PMMLPipeline([
        ("tf-idf",
         TfidfVectorizer(
             analyzer="word",
             preprocessor=None,
             strip_accents=None,
             lowercase=True,
             token_pattern=None,
             tokenizer=Splitter(),
             stop_words="english",
             ngram_range=(1, 2),
             norm=None,
             dtype=(numpy.float32 if isinstance(
                 classifier, RandomForestClassifier) else numpy.float64))),
        ("selector", SelectKBest(f_classif, k=500)), ("classifier", classifier)
    ])
    pipeline.fit(sentiment_X, sentiment_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name + ".pkl")
    score = DataFrame(pipeline.predict(sentiment_X), columns=["Score"])
    if with_proba:
        score_proba = DataFrame(pipeline.predict_proba(sentiment_X),
                                columns=["probability(0)", "probability(1)"])
        score = pandas.concat((score, score_proba), axis=1)
    store_csv(score, name + ".csv")
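With the helpers above in scope, a call might look like this (the classifier settings are assumed here, mirroring the companion examples further down):

build_sentiment(RandomForestClassifier(n_estimators=31, random_state=13), "RandomForestSentiment", compact=False)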
Example No. 2
 def test_call(self):
     splitter = Splitter()
     self.assertEqual((), splitter(""))
     self.assertEqual((), splitter("."))
     self.assertEqual(("one", ), splitter("one"))
     self.assertEqual(("++one", ), splitter("++one"))
     self.assertEqual(("one++", ), splitter("one++"))
     self.assertEqual(("one", ), splitter("--one"))
     self.assertEqual(("one", ), splitter("one--"))
     self.assertEqual(("one", "two", "three"), splitter("one two three"))
     self.assertEqual(("one", "t,w.o", "three"),
                      splitter(",one _t,w.o_ three."))
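The assertions above double as documentation for the default Splitter: it splits on whitespace and trims certain leading/trailing punctuation (hyphens, commas, underscores, periods) while keeping characters such as "+". A quick interactive check, restating only what the tests assert:

from sklearn2pmml.feature_extraction.text import Splitter

splitter = Splitter()
print(splitter("one two three"))  # ('one', 'two', 'three')
print(splitter("--one"))          # ('one',) - leading hyphens are trimmed
print(splitter("++one"))          # ('++one',) - plus signs are kept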
Example No. 3
		("densifier", DenseTransformer()),
		("selector", SelectKBest(f_classif, k = 500)),
		("classifier", classifier)
	])
	pipeline.fit(sentiment_X, sentiment_y)
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	score = DataFrame(pipeline.predict(sentiment_X), columns = ["Score"])
	if with_proba:
		score_proba = DataFrame(pipeline.predict_proba(sentiment_X), columns = ["probability(0)", "probability(1)"])
		score = pandas.concat((score, score_proba), axis = 1)
	store_csv(score, name)

if "Sentiment" in datasets:
	pmml_textindex_args = dict(analyzer = "word", preprocessor = None, strip_accents = None, dtype = numpy.float64)
	build_sentiment(LinearDiscriminantAnalysis(), TfidfVectorizer(tokenizer = Splitter(), ngram_range = (1, 3), norm = None, **pmml_textindex_args), "LinearDiscriminantAnalysisSentiment")
	build_sentiment(LinearSVC(random_state = 13), CountVectorizer(tokenizer = Splitter(), ngram_range = (1, 2), **pmml_textindex_args), "LinearSVCSentiment", with_proba = False)
	build_sentiment(LogisticRegression(multi_class = "ovr"), TfidfVectorizer(stop_words = "english", tokenizer = Matcher(), ngram_range = (1, 3), binary = True, norm = None, **pmml_textindex_args), "LogisticRegressionSentiment")
	build_sentiment(RandomForestClassifier(max_depth = 8, min_samples_leaf = 10, n_estimators = 31, random_state = 13), CountVectorizer(ngram_range = (1, 2), **pmml_textindex_args), "RandomForestSentiment")
	build_sentiment(XGBClassifier(objective = "binary:logistic", ntree_limit = 31, random_state = 13), CountVectorizer(tokenizer = Matcher(), **pmml_textindex_args), "XGBoostSentiment")

#
# Multi-class classification
#

def load_iris(name):
	df = load_csv(name)
	return split_csv(df)

iris_X, iris_y = load_iris("Iris")

Example No. 4
def typicalsamling(group, typicalNDict):
    # Reconstructed signature; matches the .apply(typicalsamling, typicalNDict_Major) call below.
    # Sample a fixed number of rows per group, keyed by the group's name.
    name = group.name
    n = typicalNDict[name]
    return group.sample(n=n)


major = pd.read_csv(r"C:\Users\钟顺民\Desktop\6.csv", sep=",", encoding='ISO-8859-1') \
    .dropna().groupby('id', as_index=False, group_keys=False) \
    .apply(typicalsamling, typicalNDict_Major)

# Split the data into features and labels
X = major.drop(['id'], axis=1)
Y = major["id"]
print("data done!")

pipeline = PMMLPipeline([
    ('mapper', DataFrameMapper([
        ('name', TfidfVectorizer(norm=None, analyzer="word", max_features=500, tokenizer=Splitter())),
        ('description', TfidfVectorizer(norm=None, analyzer="word", max_features=1000, tokenizer=Splitter()))
    ])),
    ('model', SVC(max_iter=10000)),  # train on TF-IDF vectors with an SVM classifier (note: SVC defaults to an RBF kernel, not linear)
])
print("model set done!")
pipeline.fit(X, Y)
print("model fit done!")
c = pd.read_csv(r"C:\Users\钟顺民\Desktop\6.csv", sep=',', encoding='ISO-8859-1').dropna().sample(n=200)

prediction = pipeline.predict(c.drop(['id'], axis=1))
t = c['id']
print("Accuracy Score ->", accuracy_score(prediction, t) * 100)
"""
Accuracy Score -> 98.5
"""
Example No. 5
			("count", WordCountTransformer())
		])),
		("selector", SelectKBest(f_classif, k = 1000)),
		("classifier", classifier)
	])
	pipeline.fit(sentiment_X, sentiment_y)
	pipeline.configure(**pmml_options)
	store_pkl(pipeline, name)
	score = DataFrame(pipeline.predict(sentiment_X), columns = ["Score"])
	if with_proba:
		score_proba = DataFrame(pipeline.predict_proba(sentiment_X), columns = ["probability(0)", "probability(1)"])
		score = pandas.concat((score, score_proba), axis = 1)
	store_csv(score, name)

if "Sentiment" in datasets:
	build_sentiment(LinearSVC(random_state = 13), Splitter(), "LinearSVCSentiment", with_proba = False)
	build_sentiment(LogisticRegressionCV(cv = 3), None, "LogisticRegressionSentiment")
	build_sentiment(RandomForestClassifier(n_estimators = 10, min_samples_leaf = 3, random_state = 13), Matcher(), "RandomForestSentiment", compact = False)

#
# Regression
#

auto_X, auto_y = load_auto("Auto")

auto_X["cylinders"] = auto_X["cylinders"].astype(int)
auto_X["model_year"] = auto_X["model_year"].astype(int)
auto_X["origin"] = auto_X["origin"].astype(int)

def build_auto(regressor, name, fit_params = {}, predict_params = {}, **pmml_options):
	cylinders_origin_mapping = {

Example No. 6
def get_major_data():
    # Reconstructed signature; the name matches the get_major_data() call below.
    data = pd.read_csv(r"../data/major_sample.csv", encoding='ISO-8859-1')
    return data


if __name__ == '__main__':
    samples = get_major_data()
    X = samples.drop(['id'], axis=1)
    Y = samples["id"]

    pipeline = PMMLPipeline([
        ('mapper',
         DataFrameMapper([('name',
                           TfidfVectorizer(norm=None,
                                           analyzer="word",
                                           max_features=1000,
                                           tokenizer=Splitter())),
                          ('description',
                           TfidfVectorizer(norm=None,
                                           analyzer="word",
                                           max_features=1000,
                                           tokenizer=Splitter()))])),
        ('model', SVC(max_iter=10000)),  # train on TF-IDF vectors with an SVM classifier (note: SVC defaults to an RBF kernel, not linear)
    ])
    print("model set done!")

    pipeline.fit(X, Y)
    print("model fit done!")

    c = pd.read_csv(r"../data/klarna 2.csv",
                    encoding='ISO-8859-1').sample(n=200)
Example No. 7
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn.cluster import KMeans
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml

# Build the pipeline
pipeline = PMMLPipeline([("td_vector",
                          TfidfVectorizer(max_df=0.7,
                                          min_df=0.01,
                                          tokenizer=Splitter(),
                                          norm=None)),
                         ("km", KMeans(n_clusters=100, random_state=1000))])
# Note: the last step of a PMMLPipeline must be an estimator; TfidfVectorizer must be used with norm=None, and the tokenizer must be a Splitter()
# Train the model; sentences is a collection of space-tokenized sentences (or a file)
pipeline.fit(sentences)
# Save the pipeline model as PMML
sklearn2pmml(pipeline, "hzd.pmml")
# Print the prediction results
print(pipeline.predict(sentences))
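The snippet assumes a sentences variable is already defined. A minimal sketch of what it might look like (hypothetical data; note that KMeans needs at least n_clusters=100 documents to fit):

# Hypothetical input: each document is one string of space-separated tokens.
sentences = [
    "export the trained model",
    "cluster short text documents",
    # ... at least 100 documents in total for n_clusters=100
]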
Example No. 8
 def test_pickle(self):
     splitter = Splitter(r"\W")
     self.assertEqual(r"\W", splitter.separator_re)
     splitter_clone = SplitterTest._clone(splitter)
     self.assertEqual(r"\W", splitter_clone.separator_re)
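SplitterTest._clone is not shown in this fragment; given the test name, it presumably round-trips the object through pickle. A hypothetical helper with that behavior:

import pickle

def _clone(obj):
    # Serialize and deserialize, returning an equivalent copy of the object.
    return pickle.loads(pickle.dumps(obj))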
Example No. 9
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import CategoricalDomain, ContinuousDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing import PMMLLabelEncoder
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

binary = False

data = pd.read_csv("test/support/mpg.csv")
if binary:
    data["drv"] = data["drv"].replace("r", "4")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["class"]
text_features = []

mapper = DataFrameMapper(
    [(numeric_features, [ContinuousDomain()])] +
    [([f], [CategoricalDomain(), PMMLLabelEncoder()])
     for f in categorical_features] +
    [(f, [CategoricalDomain(),
          CountVectorizer(tokenizer=Splitter())]) for f in text_features])

pipeline = PMMLPipeline([("mapper", mapper),
                         ("model", LGBMClassifier(n_estimators=1000))])
pipeline.fit(data, data["drv"], model__categorical_feature=[3])  # column 3 of the mapper output is the label-encoded "class" column (after the 3 numeric columns)

suffix = "binary" if binary else "multiclass"
sklearn2pmml(pipeline, "test/support/python/lightgbm_" + suffix + ".pmml")

print(pipeline.predict(data[:10]))
Example No. 10
from common import *

from sklearn2pmml.feature_extraction.text import Matcher, Splitter

sentiment_X, sentiment_y = load_sentiment("Sentiment")

stop_words = [
    "a", "and", "are", "d", "i", "is", "it", "ll", "m", "s", "the", "ve", "we",
    "you"
]


def tokenize(tokenizer, name):
    def process(line):
        tokens = tokenizer(line.lower())
        tokens = [token for token in tokens if token not in stop_words]
        return "\t".join(tokens)

    sentiment_processed_X = sentiment_X.apply(process)
    store_csv(sentiment_processed_X, name)


# "(?u)\b\w\w+\b" is scikit-learn's default token_pattern, so this reproduces CountVectorizer's built-in tokenization
tokenize(Matcher("(?u)\\b\\w\\w+\\b"), "CountVectorizerSentiment")

tokenize(Matcher("\\w+"), "MatcherSentiment")
tokenize(Splitter("\\s+"), "SplitterSentiment")
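Matcher and Splitter tokenize in opposite ways: Matcher extracts the substrings that match its regex, while Splitter splits the text on its regex. A quick sketch of the distinction:

from sklearn2pmml.feature_extraction.text import Matcher, Splitter

line = "tokenize this line"
print(Matcher(r"\w+")(line))   # tokens are the regex matches
print(Splitter(r"\s+")(line))  # tokens are the pieces between separator matches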
Example No. 11
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from lightgbm import LGBMRegressor
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import ContinuousDomain, CategoricalDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing import PMMLLabelEncoder
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

data = pd.read_csv("test/support/mpg.csv")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["drv", "class"]
text_features = ["model"]

mapper = DataFrameMapper(
  [(numeric_features, [ContinuousDomain()])] +
  [([f], [CategoricalDomain(), PMMLLabelEncoder()]) for f in categorical_features] +
  [(f, [CategoricalDomain(), CountVectorizer(tokenizer=Splitter(), max_features=5)]) for f in text_features]
)

pipeline = PMMLPipeline([
  ("mapper", mapper),
  ("model", LGBMRegressor(n_estimators=1000))
])
pipeline.fit(data, data["hwy"], model__categorical_feature=[3, 4])  # columns 3 and 4 of the mapper output are the label-encoded "drv" and "class" columns

sklearn2pmml(pipeline, "test/support/python/lightgbm_regression.pmml")

print(pipeline.predict(data[:10]))
Example No. 12
		("densifier", DenseTransformer()),
		("selector", SelectKBest(f_classif, k = 500)),
		("classifier", classifier)
	])
	pipeline.fit(sentiment_X, sentiment_y)
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	score = DataFrame(pipeline.predict(sentiment_X), columns = ["Score"])
	if with_proba:
		score_proba = DataFrame(pipeline.predict_proba(sentiment_X), columns = ["probability(0)", "probability(1)"])
		score = pandas.concat((score, score_proba), axis = 1)
	store_csv(score, name)

if "Sentiment" in datasets:
	pmml_textindex_args = dict(analyzer = "word", preprocessor = None, strip_accents = None, dtype = numpy.float64)
	build_sentiment(LinearSVC(random_state = 13), CountVectorizer(tokenizer = Splitter(), ngram_range = (1, 2), **pmml_textindex_args), "LinearSVCSentiment", with_proba = False)
	build_sentiment(LogisticRegression(multi_class = "ovr"), TfidfVectorizer(stop_words = "english", tokenizer = Matcher(), ngram_range = (1, 3), norm = None, **pmml_textindex_args), "LogisticRegressionSentiment")
	build_sentiment(RandomForestClassifier(max_depth = 8, min_samples_leaf = 10, n_estimators = 31, random_state = 13), CountVectorizer(ngram_range = (1, 2), **pmml_textindex_args), "RandomForestSentiment")
	build_sentiment(XGBClassifier(objective = "binary:logistic", ntree_limit = 31, random_state = 13), CountVectorizer(tokenizer = Matcher(), **pmml_textindex_args), "XGBoostSentiment")

#
# Multi-class classification
#

def load_iris(name):
	df = load_csv(name)
	return split_csv(df)

iris_X, iris_y = load_iris("Iris")

def build_iris(classifier, name, **pmml_options):
Example No. 13
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import ContinuousDomain, CategoricalDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

data = pd.read_csv("test/support/mpg.csv")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["drv", "class"]
text_features = ["model"]

mapper = DataFrameMapper(
    [(numeric_features, [ContinuousDomain()])] +
    [([f], [CategoricalDomain(), OneHotEncoder()])
     for f in categorical_features] + [(f, [
         CategoricalDomain(),
         CountVectorizer(tokenizer=Splitter(), max_features=5)
     ]) for f in text_features])

pipeline = PMMLPipeline([("mapper", mapper), ("model", LinearRegression())])
pipeline.fit(data, data["hwy"])

sklearn2pmml(pipeline, "test/support/python/linear_regression_text.pmml")

print(list(pipeline.predict(data[:10])))
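Note the column-key convention in the mapper above: a list key such as [f] passes a 2-D column to the transformer (what OneHotEncoder expects), whereas a bare string key such as f passes a 1-D series (what CountVectorizer expects). A minimal sketch of the 1-D case, on hypothetical data:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

df = pd.DataFrame({"model": ["a4 quattro", "mustang gt"]})

# A string key hands CountVectorizer the 1-D series it requires.
mapper = DataFrameMapper([("model", CountVectorizer(tokenizer=Splitter()))])
print(mapper.fit_transform(df).shape)  # (2, vocabulary size)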