# data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type})
data = pd.read_csv(path, header=None)
data[4] = pd.Categorical(data[4]).codes
# iris_types = data[4].unique()
# print(iris_types)
# for i, type in enumerate(iris_types):
#     data.set_value(data[4] == type, 4, i)
x, y = np.split(data.values, (4,), axis=1)
# print('x = \n', x)
# print('y = \n', y)
# use only the first two features
x = x[:, :2]
lr = Pipeline([('sc', StandardScaler()),
               ('poly', PolynomialFeatures(degree=3)),
               ('clf', LogisticRegression())])
lr.fit(x, y.ravel())
y_hat = lr.predict(x)
y_hat_prob = lr.predict_proba(x)
np.set_printoptions(suppress=True)
print('y_hat = \n', y_hat)
print('y_hat_prob = \n', y_hat_prob)
print('accuracy: %.2f%%' % (100 * np.mean(y_hat == y.ravel())))

# plotting
N, M = 500, 500  # number of sample points along each axis
x1_min, x1_max = x[:, 0].min(), x[:, 0].max()  # range of column 0
x2_min, x2_max = x[:, 1].min(), x[:, 1].max()  # range of column 1
t1 = np.linspace(x1_min, x1_max, N)
t2 = np.linspace(x2_min, x2_max, M)
x1, x2 = np.meshgrid(t1, t2)  # build the grid of sample points
import machine_learning.utility as utility
import enums
import time
from datetime import datetime

# Load data
any_football_data = load_data(enums.SportsType.Football, enums.CampType.Any)

# Training dataset
train_X = any_football_data.drop(label_columns, axis=1)
train_Y = any_football_data['score']  # match result (win=0, draw=1, loss=2)

# Feature preprocessing
number_pipeline_home = Pipeline([
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("scaler", StandardScaler()),
])
feature_pipeline = ColumnTransformer([
    ("num", number_pipeline_home, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs_with_camp),
])

# Model training
prepared_train = feature_pipeline.fit_transform(train_X)
print(f'prepared_train data_set shape: {prepared_train.shape}')
start_prepared = time.time()
lin_reg = LinearRegression()
lin_reg.fit(prepared_train, train_Y)  # train
scalar = MinMaxScaler()
pca = PCA(svd_solver='randomized', random_state=42)
# Note: this is the old sklearn.cross_validation signature, which took the labels
# and the number of iterations directly in the constructor.
cv = StratifiedShuffleSplit(y, 100, random_state=42)
print("done")

# GaussianNB
from sklearn.naive_bayes import GaussianNB
naive_clf = GaussianNB()
parameters = {}
naive_clf_grid = GridSearchCV(naive_clf, parameters, cv=cv, scoring='f1')
naive_clf_grid.fit(X, y)
print("before pca f1: ", naive_clf_grid.best_score_)

pca_naive_clf = Pipeline([('pca', pca), ('svc', naive_clf)])  # step named 'svc' but holds the GaussianNB
parameters = {'pca__n_components': range(1, 5)}
pca_naive_clf_grid = GridSearchCV(pca_naive_clf, parameters, cv=cv, scoring='f1')
pca_naive_clf_grid.fit(X, y)
print("after pca f1: ", pca_naive_clf_grid.best_score_)

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html
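# A minimal tuning sketch for Task 5 under the modern sklearn.model_selection API
# (an assumption: the snippet above uses the pre-0.18 sklearn.cross_validation
# signature). The same X, y and f1 scoring are assumed to be in scope.
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB

cv = StratifiedShuffleSplit(n_splits=100, random_state=42)  # labels go to fit(), not the constructor
pipe = Pipeline([('pca', PCA(svd_solver='randomized', random_state=42)),
                 ('nb', GaussianNB())])
param_grid = {'pca__n_components': range(1, 5)}
grid = GridSearchCV(pipe, param_grid, cv=cv, scoring='f1')
grid.fit(X, y)
print("tuned f1:", grid.best_score_)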
# Completing the pandas-to-numpy selector: the class header was missing from the
# excerpt; the BaseEstimator/TransformerMixin bases and __init__ signature are
# assumed from the body below.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.attribute_names].values


# list of attributes for the DataFrameSelector (pandas to numpy)
room_attrib = [
    attr for attr in list(room_data)
    if not re.search(r'date|Occupancy', attr)  # pattern first, string second
]
print(room_attrib)

pipeline = Pipeline([
    ('selector', DataFrameSelector(room_attrib)),
    ('std_scaler', StandardScaler()),
])

# axis=1 implies column
room_prepared = pipeline.fit_transform(room_data)
print(room_prepared)

# %load train.py
import os
import numpy as np
import scipy.io
import scipy.optimize as optimization

N_EPOCH = 1
n_mis = []
        prediction = clf.predict(predict_me)
        if prediction == y[i]:
            correct += 1

    print(float(correct) / float(len(X)))
'''
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import decomposition
from sklearn.pipeline import Pipeline

logistic = LogisticRegression()
pca = decomposition.PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

X = np.array(df.drop('survived', axis=1))
X = preprocessing.scale(X)
print(X.shape)
y = np.array(df['survived'])
print(y.shape)

clf = pca.fit_transform(X, y)  # note: this is the PCA-projected data, not a fitted classifier

plt.figure(1, figsize=(5, 5))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_')

n_components = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
y = digits.target

# Throw away data, to be in the curse-of-dimensionality setting
y = y[:200]
X = digits.data[:200]
n_samples = len(y)
X = X.reshape((n_samples, -1))
# add 200 non-informative features
X = np.hstack((X, 2 * np.random.random((n_samples, 200))))

###############################################################################
# Create a feature-selection transform and an instance of SVM that we
# combine together to have a full-blown estimator
transform = feature_selection.SelectPercentile(feature_selection.f_classif)

clf = Pipeline([('anova', transform), ('svc', svm.SVC())])

###############################################################################
# Plot the cross-validation score as a function of percentile of features
score_means = list()
score_stds = list()
percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)

for percentile in percentiles:
    clf.set_params(anova__percentile=percentile)
    # Compute cross-validation score (n_jobs=1, i.e. a single CPU)
    this_scores = cross_validation.cross_val_score(clf, X, y, n_jobs=1)
    score_means.append(this_scores.mean())
    score_stds.append(this_scores.std())

pl.errorbar(percentiles, score_means, np.array(score_stds))
# load data
url = os.path.join(os.getcwd(), 'pima-indians-diabetes.data.csv')
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names, sep=',')
# print(dataframe)
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]

# create a feature union
features = []
features.append(('pca', PCA(n_components=3)))
features.append(('select_best', SelectKBest(k=6)))
feature_union = FeatureUnion(features)

# create a pipeline
estimators = []
# estimators.append(('standardize', StandardScaler()))
# estimators.append(('lda', LinearDiscriminantAnalysis()))
estimators.append(('feature_union', feature_union))
estimators.append(('logistic', LogisticRegression()))
model = Pipeline(estimators)

# evaluate pipeline
seed = 7
# shuffle=True is required for random_state to take effect in recent scikit-learn
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
import views_utils.dbutils as dbutils

sys.path.insert(0, "../../../osa")
from osa.wrapper_sm import SMLogit
import osa.utils as osa

uname = "VIEWSADMIN"
prefix = "postgresql"
db = "views"
port = "5432"
hostname = "VIEWSHOST"
connectstring = dbutils.make_connectstring(prefix, db, uname, hostname, port)

rf_500 = RandomForestClassifier(n_estimators=500, n_jobs=10)
scaler = StandardScaler()
pipe_rf_500 = Pipeline([('scaler', scaler), ('rf', rf_500)])

output_schema = "landed_test"
output_table = "osa_pgm_acled_histonly_fcast_calib_sb"

models = [{
    "dir_pickles":
        "$SNIC_TMP/osa/pickles/osa_pgm_acled_histonly_fcast_calib_sb/pgm_acled_histonly_fcast_calib_logit_fullsample_sb",
    "estimator": SMLogit(),
    "features": [
        "l2_ged_dummy_sb",
        "l3_ged_dummy_sb",
        "l4_ged_dummy_sb",
        "l5_ged_dummy_sb",
        "l6_ged_dummy_sb",
        "l7_ged_dummy_sb",
        "l8_ged_dummy_sb",
        "l9_ged_dummy_sb",
        "l10_ged_dummy_sb",
        "l11_ged_dummy_sb",
        "l12_ged_dummy_sb",
        "q_1_1_l2_ged_dummy_sb",
        "q_1_1_l3_ged_dummy_sb",
        "l1_ged_dummy_sb",
        "l1_ged_dummy_ns",
4.4 Learning curve
------------------------------------------------------------------------------------------------------------------------
'''
print('---------------------------------------------------------------------------------------------------------------\n'
      ' 4.4 Learning curve \n'
      '---------------------------------------------------------------------------------------------------------------\n')

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

for style, width, degree in (("g-", 1, 300), ("b--", 2, 2), ("r-+", 2, 1)):
    polybig_features = PolynomialFeatures(degree=degree, include_bias=False)
    std_scaler = StandardScaler()
    lin_reg = LinearRegression()
    polynomial_regression = Pipeline([
        ("poly_features", polybig_features),
        ("std_scaler", std_scaler),
        ("lin_reg", lin_reg),
    ])
    polynomial_regression.fit(X, y)
    y_newbig = polynomial_regression.predict(X_new)
    plt.plot(X_new, y_newbig, style, label=str(degree), linewidth=width)

plt.plot(X, y, "b.", linewidth=3)
plt.legend(loc="upper left")
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.axis([-3, 3, 0, 10])
save_fig("high_degree_polynomials_plot")
plt.show()
print()
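# The heading above announces a learning curve, but the snippet only overlays
# polynomial fits of different degrees. A minimal learning-curve sketch using
# sklearn.model_selection.learning_curve (assuming the same X, y and the
# polynomial_regression pipeline defined above) could look like this:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

train_sizes, train_scores, valid_scores = learning_curve(
    polynomial_regression, X, y,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5, scoring="neg_mean_squared_error")
train_rmse = np.sqrt(-train_scores.mean(axis=1))
valid_rmse = np.sqrt(-valid_scores.mean(axis=1))

plt.plot(train_sizes, train_rmse, "r-+", linewidth=2, label="train")
plt.plot(train_sizes, valid_rmse, "b-", linewidth=3, label="validation")
plt.legend(loc="upper right")
plt.xlabel("Training set size")
plt.ylabel("RMSE")
plt.show()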
data_set = DataSet()
data, label, class_names = data_set.get_train_data_set()
indexs = random.sample(range(len(data)), 50000)
data = data[indexs]
label = label[indexs]
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.33,
                                                    random_state=42)

est = [('count_vect', CountVectorizer()),
       ('tr', TruncatedSVD(n_components=10, n_iter=100, random_state=42)),
       ('clf_DT', DecisionTreeClassifier())]
pipeline_DT = Pipeline(est)
pipeline_DT = pipeline_DT.fit(X_train, y_train)
y_pred = pipeline_DT.predict(X_test)

# reuse y_pred instead of calling predict() again for each metric
print("F1 score - DT:", f1_score(y_test, y_pred, average='micro'))
print("Accuracy Score - DT:", accuracy_score(y_test, y_pred))

cnf_matrix = confusion_matrix(y_test, y_pred)
plt.figure()
plt = plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                            title='Normalized confusion matrix DT')
plt.show()
def test_set_pipeline_step_none():
    # Test setting Pipeline steps to None
    X = np.array([[1]])
    y = np.array([1])
    mult2 = Mult(mult=2)
    mult3 = Mult(mult=3)
    mult5 = Mult(mult=5)

    def make():
        return Pipeline([('m2', mult2), ('m3', mult3), ('last', mult5)])

    pipeline = make()

    exp = 2 * 3 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline.set_params(m3=None)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    assert_dict_equal(pipeline.get_params(deep=True),
                      {'steps': pipeline.steps,
                       'm2': mult2,
                       'm3': None,
                       'last': mult5,
                       'memory': None,
                       'm2__mult': 2,
                       'last__mult': 5,
                       })

    pipeline.set_params(m2=None)
    exp = 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    # for other methods, ensure no AttributeErrors on None:
    other_methods = ['predict_proba', 'predict_log_proba',
                     'decision_function', 'transform', 'score']
    for method in other_methods:
        getattr(pipeline, method)(X)

    pipeline.set_params(m2=mult2)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline = make()
    pipeline.set_params(last=None)
    # mult2 and mult3 are active
    exp = 6
    assert_array_equal([[exp]], pipeline.fit(X, y).transform(X))
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    assert_raise_message(AttributeError,
                         "'NoneType' object has no attribute 'predict'",
                         getattr, pipeline, 'predict')

    # Check None step at construction time
    exp = 2 * 5
    pipeline = Pipeline([('m2', mult2), ('m3', None), ('last', mult5)])
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
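# Note: newer scikit-learn releases document 'passthrough' as the way to disable
# a Pipeline step; None is kept for backward compatibility. A small sketch using
# the same Mult fixtures as the test above:
pipeline = make()
pipeline.set_params(m3='passthrough')
assert_array_equal([[2 * 5]], pipeline.fit_transform(X, y))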
def test_pipeline_init(): # Test the various init parameters of the pipeline. assert_raises(TypeError, Pipeline) # Check that we can't instantiate pipelines with objects without fit # method assert_raises_regex( TypeError, 'Last step of Pipeline should implement fit. ' '.*NoFit.*', Pipeline, [('clf', NoFit())]) # Smoke test with only an estimator clf = NoTrans() pipe = Pipeline([('svc', clf)]) assert_equal( pipe.get_params(deep=True), dict(svc__a=None, svc__b=None, svc=clf, **pipe.get_params(deep=False))) # Check that params are set pipe.set_params(svc__a=0.1) assert_equal(clf.a, 0.1) assert_equal(clf.b, None) # Smoke test the repr: repr(pipe) # Test with two objects clf = SVC() filter1 = SelectKBest(f_classif) pipe = Pipeline([('anova', filter1), ('svc', clf)]) # Check that we can't instantiate with non-transformers on the way # Note that NoTrans implements fit, but not transform assert_raises_regex( TypeError, 'All intermediate steps should be transformers' '.*\\bNoTrans\\b.*', Pipeline, [('t', NoTrans()), ('svc', clf)]) # Check that params are set pipe.set_params(svc__C=0.1) assert_equal(clf.C, 0.1) # Smoke test the repr: repr(pipe) # Check that params are not set when naming them wrong assert_raises(ValueError, pipe.set_params, anova__C=0.1) # Test clone pipe2 = clone(pipe) assert_false(pipe.named_steps['svc'] is pipe2.named_steps['svc']) # Check that apart from estimators, the parameters are the same params = pipe.get_params(deep=True) params2 = pipe2.get_params(deep=True) for x in pipe.get_params(deep=False): params.pop(x) for x in pipe2.get_params(deep=False): params2.pop(x) # Remove estimators that where copied params.pop('svc') params.pop('anova') params2.pop('svc') params2.pop('anova') assert_equal(params, params2)
plot.title('confusion matrix')
plot.colorbar()
plot.ylabel('expected label')
plot.xlabel('predicted label')

print(classification_report(emails['label'], all_predictions))

# Dividing the data set
msg_train, msg_test, label_train, label_test = \
    train_test_split(emails['message'], emails['label'], test_size=0.2)

print(len(msg_train), len(msg_test), len(msg_train) + len(msg_test))

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=lemmatize)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

scores = cross_val_score(
    pipeline,     # steps to convert raw emails into models
    msg_train,    # training data
    label_train,  # training labels
    cv=10,        # split data randomly into 10 parts: 9 for training, 1 for scoring
    scoring='accuracy',  # which scoring metric?
    n_jobs=-1,    # -1 = use all cores = faster
)
print(scores)
                              categorical_transformer, select_features_cat)

# Train model
# TODO: try semi-supervised learning
# Setting random_state forces the classifier to produce the same result on each run
n_cv = 5  # cv=5 is the default
scorer = "accuracy"
# model = RandomForestClassifier(random_state=random_state)
model = xgb.XGBClassifier()

pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', scaler),
    # ('pca', pca),
    ('model', model)
])

param_grid = {
    # 'preprocessor__num__imputer__strategy': ['mean', 'median'],
    # 'pca__n_components': [5, 15, 30, 45, 64],
    'model__n_estimators': [10, 50, 75, 100, 125, 200, 300],
    # max_depth is usually 6, 7, or 8
    'model__max_depth': list(range(2, 10)),
    # the learning rate is around 0.05, but small changes may make a big difference
    'model__learning_rate': [0.03, 0.05, 0.07, 0.09, 0.1],
    # 'model__subsample': list(map(lambda x: x * 0.1, range(1, 10))),
def test_pipeline_column_transformer(self): iris = datasets.load_iris() X = iris.data[:, :3] y = iris.target X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"]) X_train["vcat"] = X_train["vA"].apply(lambda x: "cat1" if x > 0.5 else "cat2") X_train["vcat2"] = X_train["vB"].apply(lambda x: "cat3" if x > 0.5 else "cat4") y_train = y % 2 numeric_features = [0, 1, 2] # ["vA", "vB", "vC"] categorical_features = [3, 4] # ["vcat", "vcat2"] classifier = LogisticRegression( C=0.01, class_weight=dict(zip([False, True], [0.2, 0.8])), n_jobs=1, max_iter=10, solver="lbfgs", tol=1e-3) numeric_transformer = Pipeline(steps=[ ("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler()), ]) categorical_transformer = Pipeline(steps=[ ( "onehot", OneHotEncoder(sparse=True, handle_unknown="ignore"), ), ( "tsvd", TruncatedSVD(n_components=1, algorithm="arpack", tol=1e-4), ), ]) preprocessor = ColumnTransformer(transformers=[ ("num", numeric_transformer, numeric_features), ("cat", categorical_transformer, categorical_features), ]) model = Pipeline(steps=[("precprocessor", preprocessor), ("classifier", classifier)]) model.fit(X_train, y_train) initial_type = [ ("numfeat", FloatTensorType([None, 3])), ("strfeat", StringTensorType([None, 2])), ] X_train = X_train[:11] model_onnx = convert_sklearn(model, initial_types=initial_type, target_opset=TARGET_OPSET) dump_data_and_model( X_train, model, model_onnx, basename="SklearnPipelineColumnTransformerPipeliner") if __name__ == "__main__": from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer pydot_graph = GetPydotGraph( model_onnx.graph, name=model_onnx.graph.name, rankdir="TP", node_producer=GetOpNodeProducer("docstring")) pydot_graph.write_dot("graph.dot") import os os.system("dot -O -G=300 -Tpng graph.dot")
print('sizes_neg = ' + str(b))
c = np.median(np.array(sizes_neu))
print('sizes_neu = ' + str(c))
'''
print('#### Preprocessing done')

# build models
print('#### Building models started')

# This is not the best solution, but to keep model averaging simple we fit the
# same CountVectorizer/TfidfTransformer pair three times on the same data
# (a shared feature pipeline is sketched below).
pipeline_nb = Pipeline([('vect', CountVectorizer(lowercase=False,
                                                 max_df=0.8,
                                                 max_features=50000,
                                                 ngram_range=(1, 3))),
                        ('tfidf', TfidfTransformer()),
                        ('clf', MultinomialNB())])

pipeline_sgd = Pipeline([('vect', CountVectorizer(lowercase=False,
                                                  max_df=0.8,
                                                  max_features=50000,
                                                  ngram_range=(1, 3))),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier())])

pipeline_lr = Pipeline([('vect', CountVectorizer(lowercase=False,
                                                 max_df=0.8,
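# Sketch of the shared feature pipeline referenced above: fit the
# CountVectorizer/TfidfTransformer pair once and reuse the resulting matrix for
# all three classifiers. X_train and y_train are assumed to hold the
# preprocessed texts and labels used elsewhere in this script.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression

shared_features = Pipeline([('vect', CountVectorizer(lowercase=False, max_df=0.8,
                                                     max_features=50000,
                                                     ngram_range=(1, 3))),
                            ('tfidf', TfidfTransformer())])
X_tfidf = shared_features.fit_transform(X_train)

estimators = {'nb': MultinomialNB(),
              'sgd': SGDClassifier(),
              'lr': LogisticRegression(max_iter=1000)}
for name, estimator in estimators.items():
    estimator.fit(X_tfidf, y_train)  # every model reuses the same TF-IDF matrix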
def test_pipeline_column_transformer_titanic(self): # fit try: titanic_url = ( "https://raw.githubusercontent.com/amueller/" "scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv") data = pandas.read_csv(titanic_url) except url_error.URLError: # Do not fail the test if the data cannot be fetched. warnings.warn("Unable to fetch titanic data.") return X = data.drop("survived", axis=1) y = data["survived"] # SimpleImputer on string is not available for string # in ONNX-ML specifications. # So we do it beforehand. for cat in ["embarked", "sex", "pclass"]: X[cat].fillna("missing", inplace=True) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2) numeric_features = ["age", "fare"] numeric_transformer = Pipeline(steps=[ ("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler()), ]) categorical_features = ["embarked", "sex", "pclass"] categorical_transformer = Pipeline(steps=[ # --- SimpleImputer on string is not available # for string in ONNX-ML specifications. # ('imputer', # SimpleImputer(strategy='constant', fill_value='missing')), ("onehot", OneHotEncoder(handle_unknown="ignore")) ]) preprocessor = ColumnTransformer(transformers=[ ("num", numeric_transformer, numeric_features), ("cat", categorical_transformer, categorical_features), ]) clf = Pipeline(steps=[ ("preprocessor", preprocessor), # ("classifier", LogisticRegression(solver="lbfgs")), ]) # inputs def convert_dataframe_schema(df, drop=None): inputs = [] for k, v in zip(df.columns, df.dtypes): if drop is not None and k in drop: continue if v == 'int64': t = Int64TensorType([None, 1]) elif v == "float64": t = FloatTensorType([None, 1]) else: t = StringTensorType([None, 1]) inputs.append((k, t)) return inputs to_drop = { "parch", "sibsp", "cabin", "ticket", "name", "body", "home.dest", "boat", } X_train = X_train.copy() X_test = X_test.copy() X_train['pclass'] = X_train['pclass'].astype(numpy.int64) X_test['pclass'] = X_test['pclass'].astype(numpy.int64) X_train = X_train.drop(to_drop, axis=1) X_test = X_test.drop(to_drop, axis=1) # Step 1: without classifier clf.fit(X_train, y_train) initial_inputs = convert_dataframe_schema(X_train, to_drop) model_onnx = convert_sklearn(clf, "pipeline_titanic", initial_inputs, target_opset=TARGET_OPSET) data = X_test pred = clf.transform(data) data_types = { 'pclass': numpy.int64, 'age': numpy.float32, 'sex': numpy.str_, 'fare': numpy.float32, 'embarked': numpy.str_, } inputs = {k: data[k].values.astype(data_types[k]).reshape(-1, 1) for k in data.columns} sess = InferenceSession(model_onnx.SerializeToString()) run = sess.run(None, inputs) got = run[-1] assert_almost_equal(pred, got, decimal=5) # Step 2: with classifier clf = Pipeline(steps=[ ("preprocessor", preprocessor), ("classifier", LogisticRegression(solver="lbfgs")), ]).fit(X_train, y_train) pred = clf.predict_proba(data) model_onnx = convert_sklearn(clf, "pipeline_titanic", initial_inputs, target_opset=TARGET_OPSET, options={id(clf): {'zipmap': False}}) sess = InferenceSession(model_onnx.SerializeToString()) run = sess.run(None, inputs) got = run[-1] assert_almost_equal(pred, got, decimal=5)
def parse_newsdata(): # Parse news json files X_buzz, y_buzz, X_poli, y_poli = [], [], [], [] i = 0 for dataset in datasets: dataset_dir = os.path.join(data_dir, dataset) fakenews_dir = os.path.join(dataset_dir, 'FakeNewsContent') realnews_dir = os.path.join(dataset_dir, 'RealNewsContent') no_realnews = 0 no_fakenews = 0 no_articles = 0 doc_ind = [] Realnews = sorted(os.listdir(realnews_dir), key=lambda x:int(x.split('-')[0].split('_')[2])) Fakenews = sorted(os.listdir(fakenews_dir), key=lambda x:int(x.split('-')[0].split('_')[2])) print Realnews print Fakenews for realnews in Realnews: if realnews.split('.')[1] != 'py': #with open(os.path.join(fakenews_dir, fakenews), 'r').read() as fd: f = open(os.path.join(realnews_dir, realnews), 'r').read() doc_ind.append(int(realnews.split('-')[0].split('_')[2])-1) if len(f) == 0: dummy_text = "no title" + " No text" if i == 0: X_buzz.append(dummy_text) y_buzz.append(0) else: X_poli.append(dummy_text) y_poli.append(0) no_realnews += 1 continue data = json.loads(f) if i == 0: X_buzz.append(data['title'] + data['text']) y_buzz.append(0) else: X_poli.append(data['title'] + data['text']) y_poli.append(0) no_realnews += 1 for fakenews in Fakenews: if fakenews.split('.')[1] != 'py': #with open(os.path.join(fakenews_dir, fakenews), 'r').read() as fd: f = open(os.path.join(fakenews_dir, fakenews), 'r').read() doc_ind.append(int(fakenews.split('-')[0].split('_')[2])-1) if len(f) == 0: dummy_text = "no title" + " No text" if i == 0: X_buzz.append(dummy_text) y_buzz.append(1) else: X_poli.append(dummy_text) y_poli.append(1) no_fakenews += 1 continue data = json.loads(f) if i == 0: X_buzz.append(data['title'] + data['text']) y_buzz.append(1) else: X_poli.append(data['title'] + data['text']) y_poli.append(1) no_fakenews += 1 no_articles = no_realnews + no_fakenews count_vec = CountVectorizer(ngram_range=(1,2), max_features=10000) #count_vec = TfidfVectorizer(use_idf=True, smooth_idf=True, ngram_range=(1,2), max_features=10000) #svd_model = TruncatedSVD(n_components=25) svd_model = NMF(n_components=45, random_state=42) svd_transformer = Pipeline([('tfidf', count_vec), ('svd', svd_model)]) #svd_transformer = Pipeline([('tfidf', count_vec)]) if i == 0: #X_buzz_lsi = np.array(count_vec.fit_transform(X_buzz).todense()) X_buzz_lsi = svd_transformer.fit_transform(X_buzz) print X_buzz_lsi.shape #pdb.set_trace() #X_buzz_lsi = svd_transformer.fit_transform(X_buzz) print X_buzz_lsi #X_buzz_lsi = X_buzz_lsi.todense() f = open('buzz_lsi.npy', 'w') print X_buzz_lsi.shape #X1 = np.zeros_like(X_buzz_lsi) print no_articles #pdb.set_trace() #for j in xrange(no_articles): # X1[j, :] = X_buzz_lsi[doc_ind[j], :] #X_buzz_lsi = X1 np.save(f, X_buzz_lsi) else: #X_poli_lsi = np.array(count_vec.fit_transform(X_poli).todense()) #print type(X_poli_lsi) X_poli_lsi = svd_transformer.fit_transform(X_poli) print X_poli_lsi.shape #X1 = np.zeros_like(X_poli_lsi) f = open('poli_lsi.npy', 'w') #for j in xrange(no_articles): # X1[j, :] = X_poli_lsi[doc_ind[j], :] #print X1.shape #print no_articles #X_poli_lsi = X1 np.save(f, X_poli_lsi) i += 1 #y_buzz = np.array(y_buzz) #y_poli = np.array(y_poli) return X_buzz_lsi, y_buzz, X_poli_lsi, y_poli
def demo(self): import math import random import numpy as np from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import chi2 from sklearn.feature_selection import VarianceThreshold from sklearn.feature_selection import SelectKBest from numpy import array from sklearn.linear_model import LogisticRegression from sklearn.svm import LinearSVC from sklearn.feature_selection import SelectFromModel from sklearn.ensemble import ExtraTreesClassifier import re from sklearn.pipeline import Pipeline from sklearn.ensemble import RandomForestClassifier from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LinearRegression from sklearn.linear_model import Perceptron import matplotlib matplotlib.use('TkAgg') import matplotlib.pyplot as plt import Tkinter import threading import matplotlib import matplotlib.backends.backend_tkagg with open ('wpbc1.data','r')as open_file: wpbc=open_file.read() wpbc=wpbc.strip() wpbc=re.split('[\n,]',wpbc) for i in range (len(wpbc)): if wpbc[i]=='N': wpbc[i]='0' elif wpbc[i]=='R': wpbc[i]='1' elif wpbc[i]=='?': wpbc[i]='0' wpbc=[wpbc[i:i+35] for i in range (0,len(wpbc),35)] wpbc=np.array(wpbc,dtype=float) X=np.delete(wpbc,[0],axis=1) y=wpbc.T[0] # feature selection # VarianceThreshold sel = VarianceThreshold(threshold=1) sel.fit(X, y) scores1 = sel.variances_ index1 = np.argsort(scores1) n = index1[:-6] X_new_1 = np.delete(X, [n], axis=1) # SelectKBest skb = SelectKBest(chi2, k=3) skb.fit(X, y) scores2 = skb.scores_ index2 = np.argsort(scores2) n = index2[:-6] X_new_2 = np.delete(X, [n], axis=1) # L1 lsvc = LinearSVC(C=0.008, penalty="l1", dual=False) lsvc.fit(X, y) model = SelectFromModel(lsvc, prefit=True) X_new_3 = lsvc.transform(X) scores3 = lsvc.coef_ np.abs(scores3) index3 = np.argsort(scores3) # tree clf = ExtraTreesClassifier() clf.fit(X, y) model = SelectFromModel(clf, prefit=True) scores4 = clf.feature_importances_ index4 = np.argsort(scores4) n = index4[:-6] X_new_4 = np.delete(X, [n], axis=1) # pipline clf = Pipeline([ ('feature_selection', SelectFromModel(LinearSVC(penalty="l2"))), ('classification', RandomForestClassifier()) ]) clf.fit(X, y) X = PolynomialFeatures(interaction_only=True).fit_transform(X_new_1).astype(float) clf = Perceptron(fit_intercept=False, n_iter=10, shuffle=False).fit(X_new_1, y) clf.predict(X_new_1) score1 = clf.score(X_new_1, y) X = PolynomialFeatures(interaction_only=True).fit_transform(X_new_2).astype(float) clf = Perceptron(fit_intercept=False, n_iter=10, shuffle=False).fit(X_new_2, y) clf.predict(X_new_2) score2 = clf.score(X_new_2, y) X = PolynomialFeatures(interaction_only=True).fit_transform(X_new_3).astype(float) clf = Perceptron(fit_intercept=False, n_iter=10, shuffle=False).fit(X_new_3, y) clf.predict(X_new_3) score3 = clf.score(X_new_3, y) X = PolynomialFeatures(interaction_only=True).fit_transform(X_new_4).astype(float) clf = Perceptron(fit_intercept=False, n_iter=10, shuffle=False).fit(X_new_4, y) clf.predict(X_new_4) score4 = clf.score(X_new_4, y) print score1, score2, score3, score4 # 0.00505050505051 0.00505050505051 0.00505050505051 0.00505050505051 # plot '''fig = plt.figure(1)
kFolder = KFold(n_splits=N_FOLDS)
fold_count = 0
most_frequent_terms = []
for train_index, test_index in kFolder.split(data):
    print("Processing tweets {} - {}".format(
        test_index[0] + 1, test_index[-1] + 1
    ))
    data_train, target_train = data[train_index], target[train_index]
    bow_pipeline = Pipeline([
        ('count_vectorizer', CountVectorizer(min_df=5, max_df=0.7)),
        ('tf_idf_transformer', TfidfTransformer())
    ]).fit(data_train)
    pandas.DataFrame(
        bow_pipeline['count_vectorizer'].stop_words_
    ).sort_values(
        [0], ignore_index=True
    ).to_excel(
        "./report-extras/effective_stop_words_{}.xlsx".format(fold_count)
    )
    most_frequent_terms_in_this_fold = pandas.DataFrame(
        bow_pipeline['count_vectorizer'].transform(
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator

# instantiate sklearn regressor objects and a couple of polynomial pipelines
models = {
    "svr": SVR(),
    "kr": KernelRidge(),
    "rf": RandomForestRegressor(),
    "gb": GradientBoostingRegressor(),
    "lr": Pipeline([
        ("poly", PolynomialFeatures(2)),
        ("regressor", LinearRegression()),
    ]),
    "hr": Pipeline([
        ("poly", PolynomialFeatures(2)),
        ("regressor", HuberRegressor())
    ]),
    "ran": Pipeline([
        ("poly", PolynomialFeatures(2)),
        ("regressor", RANSACRegressor())
    ]),
    "gpr": GaussianProcessRegressor(),
    "wei": WeightedCurver(maxfev=100000),
    "sum": SummedCurver(maxfev=2000, method="dogbox"),
}
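# A minimal usage sketch for the models dict above (assumptions: X and y are the
# training arrays used elsewhere, and the custom WeightedCurver/SummedCurver
# regressors follow the scikit-learn estimator API so cross_val_score accepts them).
from sklearn.model_selection import cross_val_score

for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring="neg_root_mean_squared_error")
    print(f"{name}: RMSE = {-scores.mean():.3f} (+/- {scores.std():.3f})")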
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig("MNB_cf" + str(normalize) + ".png")


pipeline1 = Pipeline([
    ('vect', CountVectorizer(min_df=2, stop_words=text.ENGLISH_STOP_WORDS)),
    ('tfidf', TfidfTransformer()),
])

train_lsi, test_lsi = fetchLSIRepresentation(pipeline1, twenty_train, twenty_test)

mnb_clf = MultinomialNB()
mnb_clf.fit(train_lsi, train_target_group)
mnb_predicted = mnb_clf.predict(test_lsi)
nmb_predicted_probs = mnb_clf.predict_proba(test_lsi)

print_statistics(test_target_group, mnb_predicted)

fpr, tpr, _ = roc_curve(test_target_group, nmb_predicted_probs[:, 1])
plot_roc(fpr, tpr)

cnf_matrix = smet.confusion_matrix(test_target_group, mnb_predicted)
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix without normalization')
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Confusion matrix with normalization')
X, y = load_digits(return_X_y=True)

# Throw away data, to be in the curse-of-dimensionality setting
X = X[:200]
y = y[:200]
n_samples = len(y)
X = X.reshape((n_samples, -1))

# add 200 non-informative features
X = np.hstack((X, 2 * np.random.random((n_samples, 200))))

# #############################################################################
# Create a feature-selection transform and an instance of SVM that we
# combine together to have a full-blown estimator
transform = SelectPercentile(chi2)

clf = Pipeline([('anova', transform), ('svc', SVC(gamma="auto"))])

# #############################################################################
# Plot the cross-validation score as a function of percentile of features
score_means = list()
score_stds = list()
percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)

for percentile in percentiles:
    clf.set_params(anova__percentile=percentile)
    # Compute cross-validation score using 1 CPU
    this_scores = cross_val_score(clf, X, y, n_jobs=1)
    score_means.append(this_scores.mean())
    score_stds.append(this_scores.std())

plt.errorbar(percentiles, score_means, np.array(score_stds))
# GridSearchCV moved to sklearn.model_selection in 0.18; sklearn.grid_search is the legacy path
from sklearn.model_selection import GridSearchCV

with open('dataset.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    ip = []
    target = []
    count = 1
    for row in reader:
        target.append(row['Sentiment'])
        ip.append(row['SentimentText'])
        count += 1
        if count == 10000:
            break

pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('logReg', LogisticRegression())])

parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    # 'clf__alpha': (0.00001, 0.000001),
    # 'clf__penalty': ('l2', 'elasticnet'),
    # 'clf__n_iter': (10, 50, 80),
    'logReg__max_iter': (10, 50, 100),
    # 'auto' is deprecated in recent scikit-learn; 'balanced' is the supported option
    'logReg__class_weight': ('auto', 'balanced'),
}
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


def clean(s):
    translator = str.maketrans("", "", string.punctuation)
    return s.translate(translator)


s = session()
rows = s.query(News).filter(News.label != None).all()
X = [clean(row.title).lower() for row in rows]
y = [row.label for row in rows]

limit = int(len(rows) * 0.7)
X_train, y_train, X_test, y_test = X[:limit], y[:limit], X[limit:], y[limit:]

print('Testing my model...')
my_model = NaiveBayesClassifier(alpha=0.05)
my_model.fit(X_train, y_train)
print(my_model.score(X_test, y_test))

print('Testing sklearn...')
sk_model = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB(alpha=0.05)),
])
sk_model.fit(X_train, y_train)
print(sk_model.score(X_test, y_test))
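# Note: the slice-based split above keeps the rows in database order; if the
# labelled rows are not already shuffled, a stratified random split is a fairer
# comparison. A small sketch (same X, y as above):
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)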
cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
mpl.rcParams['font.sans-serif'] = u'SimHei'
mpl.rcParams['axes.unicode_minus'] = False
plt.figure(facecolor='w')
plt.scatter(x[:, 0], x[:, 1], s=30, c=y, marker='o', cmap=cm_dark)
plt.grid(b=True, ls=':')
plt.xlabel(u'Component 1', fontsize=14)
plt.ylabel(u'Component 2', fontsize=14)
plt.title(u'PCA projection of the iris data', fontsize=18)
# plt.savefig('1.png')
plt.show()

x, x_test, y, y_test = train_test_split(x, y, train_size=0.7)
model = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=True)),
    ('lr', LogisticRegressionCV(Cs=np.logspace(-3, 4, 8), cv=5, fit_intercept=False))
])
model.fit(x, y)
print('best C:', model.get_params('lr')['lr'].C_)
y_hat = model.predict(x)
print('training-set accuracy:', metrics.accuracy_score(y, y_hat))
y_test_hat = model.predict(x_test)
print('test-set accuracy:', metrics.accuracy_score(y_test, y_test_hat))

N, M = 500, 500  # number of sample points along each axis
x1_min, x1_max = extend(x[:, 0].min(), x[:, 0].max())  # range of column 0
x2_min, x2_max = extend(x[:, 1].min(), x[:, 1].max())  # range of column 1
t1 = np.linspace(x1_min, x1_max, N)
t2 = np.linspace(x2_min, x2_max, M)
x1, x2 = np.meshgrid(t1, t2)  # build the grid of sample points
x_show = np.stack((x1.flat, x2.flat), axis=1)  # grid of test points
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Binarizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

tree_classification_pipeline = Pipeline([
    ('tree', DecisionTreeClassifier()),
    # Forest instead of Trees
    # ('forest', RandomForestClassifier())
])

ridge_regression_pipeline = Pipeline([
    # Apply scaling to Ridge Regression
    # ('scale', StandardScaler()),
    ('ridge', Ridge())
])

lasso_regression_pipeline = Pipeline([
    # Apply scaling to Lasso Regression
    # ('scale', StandardScaler()),
    ('lasso', Lasso())
])
X = df.drop('MEDV', axis=1)
y = df['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = GradientBoostingRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# For a regressor, score() returns R^2, not classification accuracy
print(f'R^2 score: {model.score(X_test, y_test)*100:.3}%')
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')

steps = [('Gradient Boosting Regressor',
          GradientBoostingRegressor(n_estimators=500, max_depth=6))]
model = Pipeline(steps)
model.fit(X_train, y_train)
print('R^2 score: {:.0f}%'.format(model.score(X_test, y_test) * 100))

dump(model, 'sklearn_model.pkl')

"""# **Keras Models**"""

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop
from sklearn.metrics import r2_score

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y,
        'lr__penalty': ['none'],
    }
]

data['param_grid'] = param_grid

# Define pipeline and cross-validation setup
pipeline = Pipeline(
    [
        ('pt', SubsetPeaksTransformer(n_peaks=0)),
        ('bv', BinningVectorizer(n_bins=3600, min_bin=2000, max_bin=20000)),
        ('std', StandardScaler()),
        (
            'lr',
            LogisticRegression(
                class_weight='balanced',
                solver='saga'  # supports L1 and L2 penalties
            )
        )
    ],
    memory=os.getenv('TMPDIR', default=None),
)

grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    scoring='average_precision',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
)
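# Usage sketch (assumption: X_train and y_train hold the spectra and labels
# prepared earlier in this script): fit the grid search and inspect the result.
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print('average precision:', grid_search.best_score_)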