def class_prediction_error(model, X, y=None, ax=None, classes=None,
                           test_size=0.2, random_state=None, **kwargs):
    """Quick method: divides the dataset X and y into train and test splits,
    fits the model on the train split, then scores the model on the test split.

    The visualizer displays the support for each class in the fitted
    classification model as a stacked bar plot. Each bar is segmented to show
    the distribution of predicted classes for each actual class.

    This helper function is a quick wrapper to utilize the ClassPredictionError
    ScoreVisualizer for one-off analysis.

    Parameters
    ----------
    model : the Scikit-Learn estimator (should be a classifier)

    X : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features.

    y : ndarray or Series of length n
        An array or series of target or class values.

    ax : matplotlib axes
        The axes to plot the figure on.

    classes : list of strings
        The names of the classes in the target.

    test_size : float, default=0.2
        The percentage of the data to reserve as test data.

    random_state : int or None, default=None
        The value to seed the random number generator for shuffling data.

    Returns
    -------
    ax : matplotlib axes
        Returns the axes that the class prediction error plot was drawn on.
    """
    # Instantiate the visualizer
    visualizer = ClassPredictionError(model, ax, classes, **kwargs)

    # Create the train and test splits
    X_train, X_test, y_train, y_test = tts(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the visualizer on the train split and score it on the test split
    # (score calls draw under the hood)
    visualizer.fit(X_train, y_train, **kwargs)
    visualizer.score(X_test, y_test)

    # Return the axes object on the visualizer
    return visualizer.ax
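A minimal usage sketch of the quick method above, assuming the surrounding module's imports (ClassPredictionError and tts) are in scope; the synthetic dataset and classifier are illustrative only.

# Illustrative only: exercise the class_prediction_error quick method above on a
# synthetic dataset. Assumes ClassPredictionError and tts are imported as in the
# surrounding module.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=500, n_classes=3, n_informative=4, random_state=0)
ax = class_prediction_error(
    RandomForestClassifier(random_state=0), X, y,
    classes=["a", "b", "c"], test_size=0.2, random_state=0,
)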
def test_residuals_plot_pandas(self):
    """
    Test Pandas real world dataset with image similarity on Lasso
    """
    _, ax = plt.subplots()

    # Load the energy dataset from fixtures
    data = self.load_data('energy')
    target = 'heating_load'
    features = [
        "relative_compactness", "surface_area", "wall_area", "roof_area",
        "overall_height", "orientation", "glazing_area",
        "glazing_area_distribution"
    ]

    # Create instances and target
    X = pd.DataFrame(data[features])
    y = pd.Series(data[target].astype(float))

    # Create train/test splits
    splits = tts(X, y, test_size=0.2, random_state=231)
    X_train, X_test, y_train, y_test = splits

    visualizer = ResidualsPlot(Lasso(random_state=44), ax=ax)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.finalize()

    self.assert_images_similar(visualizer, tol=1, remove_legend=True)
def make_fixture(binary=False, balanced=False, split=False): """ Make a dataset for testing ClassBalance based on the specified params. """ kwargs = { "n_samples":100, "n_features":20, "n_informative":8, "n_redundant":2, "n_clusters_per_class":1, "random_state":89092, } if binary: kwargs['n_classes'] = 2 kwargs['weights'] = None if balanced else [0.3, 0.7] else: kwargs['n_classes'] = 5 kwargs['weights'] = None if balanced else [0.1, 0.2, 0.4, 0.2, .01] X, y = make_classification(**kwargs) if split: X_train, X_test, y_train, y_test = tts( X, y, test_size=0.2, random_state=101 ) return Dataset(Split(X_train, X_test), Split(y_train, y_test)) return Dataset(X, y)
def make_fruit_dataset(): X, y = make_classification( n_samples=1000, n_classes=5, n_informative=3, n_clusters_per_class=1 ) classes = ['apple', 'kiwi', 'pear', 'banana', 'orange'] return tts(X, y, test_size=0.20, random_state=42), classes
def make_dataset(): data = pd.read_csv("../../../examples/data/occupancy/occupancy.csv") X = data[["temperature", "relative humidity", "light", "C02", "humidity"]] y = data.occupancy return tts(X, y, test_size=0.2)
def test_pandas_integration(self): """ Test with Pandas DataFrame and Series input """ _, ax = plt.subplots() # Load the occupancy dataset from fixtures data = self.load_data('occupancy') target = 'occupancy' features = [ "temperature", "relative_humidity", "light", "C02", "humidity" ] # Create instances and target X = pd.DataFrame(data[features]) y = pd.Series(data[target].astype(int)) # Create train/test splits splits = tts(X, y, test_size=0.2, random_state=8873) X_train, X_test, y_train, y_test = splits # Create confusion matrix model = GaussianNB() cm = ConfusionMatrix(model, ax=ax, classes=None) cm.fit(X_train, y_train) cm.score(X_test, y_test) tol = 0.1 if six.PY3 else 40 self.assert_images_similar(cm, tol=tol) # Ensure correct confusion matrix under the hood npt.assert_array_equal(cm.confusion_matrix_, np.array([ [3012, 114], [ 1, 985] ]))
def linearSVR(data):
    X = data.drop(["id", "date", "price", "long", "lat", "zipcode",
                   "yr_renovated", "sqft_above", "sqft_basement"], axis=1)
    y = data["price"]
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.10, random_state=42)
    svr = LinearSVR(random_state=42)
    svr.fit(X_train, y_train)
    y_predict = svr.predict(X_test)
    print("r2-score for LinearSVR: %f" % r2_score(y_test, y_predict))
def ridgeRegression(data):
    from sklearn.linear_model import Ridge
    X = data.drop(["id", "date", "price", "long", "lat", "zipcode",
                   "yr_renovated", "sqft_above", "sqft_basement"], axis=1)
    y = data["price"]
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.10, random_state=42)
    ridge = Ridge(random_state=42)
    ridge.fit(X_train, y_train)
    y_predict = ridge.predict(X_test)
    print("r2-score for Ridge Regression: %f" % r2_score(y_test, y_predict))
def load_credit_dataset(): data = pd.read_csv("../../../examples/data/credit/credit.csv") target = "default" features = list(data.columns) features.remove(target) X = data[features] y = data[target] classes = ["default", "current"] return tts(X, y, test_size=0.2, random_state=53), classes
def digits(request): """ Creates a fixture of train and test splits for the sklearn digits dataset For ease of use returns a Dataset named tuple composed of two Split tuples. """ data = load_digits() X_train, X_test, y_train, y_test = tts( data.data, data.target, test_size=0.2, random_state=11 ) # Set a class attribute for digits request.cls.digits = Dataset( Split(X_train, X_test), Split(y_train, y_test) )
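A minimal sketch of a test class consuming the digits fixture above, assuming pytest and that Dataset/Split are named tuples exposing X/y and train/test fields as in these fixtures; the test class and assertions are illustrative only.

# Illustrative only: a hypothetical test class that consumes the `digits` fixture.
# Assumes pytest and Dataset/Split named tuples with X/y and train/test fields.
import pytest

@pytest.mark.usefixtures("digits")
class TestWithDigits(object):
    def test_split_shapes(self):
        X, y = self.digits.X, self.digits.y
        assert len(X.train) == len(y.train)
        assert len(X.test) == len(y.test)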
def data(request):
    """
    Creates a fixture of train and test splits for a synthetic regression dataset

    For ease of use returns a Dataset named tuple composed of two Split tuples.
    """
    X, y = make_regression(
        n_samples=500, n_features=22, n_informative=8, random_state=42,
        noise=0.2, bias=0.2,
    )

    X_train, X_test, y_train, y_test = tts(
        X, y, test_size=0.2, random_state=11
    )

    # Set a class attribute for the regression data
    request.cls.data = Dataset(
        Split(X_train, X_test), Split(y_train, y_test)
    )
def test_pandas_integration(self): """ Test with Pandas DataFrame and Series input """ _, ax = plt.subplots() # Load the occupancy dataset from fixtures data = self.load_data('occupancy') target = 'occupancy' features = [ "temperature", "relative_humidity", "light", "C02", "humidity" ] # Create instances and target X = pd.DataFrame(data[features]) y = pd.Series(data[target].astype(int)) # Create train/test splits splits = tts(X, y, test_size=0.2, random_state=4512) X_train, X_test, y_train, y_test = splits classes = ['unoccupied', 'occupied'] # Create classification report model = GaussianNB() viz = ClassificationReport(model, ax=ax, classes=classes) viz.fit(X_train, y_train) viz.score(X_test, y_test) self.assert_images_similar(viz, tol=43.0) # Ensure correct classification scores under the hood! assert viz.scores_ == { 'precision': { 'unoccupied': 0.999347471451876, 'occupied': 0.8825214899713467 }, 'recall': { 'unoccupied': 0.9613935969868174, 'occupied': 0.9978401727861771 }, 'f1': { 'unoccupied': 0.9800031994880819, 'occupied': 0.9366447034972124 }}
def cluster_regressors(data):
    X = data.drop(["id", "date", "price", "sqft_above", "sqft_basement"], axis=1)
    y = data["price"]

    ## split into train and test set
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.10, random_state=42)

    X_train_clustered = cluster_data(X_train)
    describe_cluster(features=["waterfront", "view", "condition", "grade",
                               "yr_built", "yr_renovated"],
                     X=X_train_clustered, n_clusters=5)
    describe_cluster(features=["bedrooms", "bathrooms", "floors"],
                     X=X_train_clustered, n_clusters=4)

    ## Train optimized regressors
    regressors = train_optimized_regressors(X_train_clustered, y_train)
    y_predict = predict_optimized(regressors=regressors, X=X_test)
    r2_optimized = r2_score(y_test, y_predict)
    print("r2-score for Clustered Regressors: %.4f" % r2_optimized)
def test_pandas_occupancy_compare(self): """ Test pandas data frame with string target in compare mode """ data = self.load_data("occupancy") features = [ "temperature", "relative_humidity", "light", "C02", "humidity" ] X = pd.DataFrame(data[features]) y = pd.Series([ "occupied" if yi else "unoccupied" for yi in data['occupancy'] ]) _, _, y_train, y_test = tts(X, y, test_size=0.4, random_state=2242) # Create and fit the visualizer oz = ClassBalance() assert oz.fit(y_train, y_test) is oz #oz.finalize() self.assert_images_similar(oz)
def test_score_returns_score(self): """ Test that ConfusionMatrix score() returns a score between 0 and 1 """ data = self.load_data("occupancy") X = data[[ "temperature", "relative_humidity", "light", "C02", "humidity" ]] y = data['occupancy'] # Convert X to an ndarray X = X.copy().view((float, len(X.dtype.names))) X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42) # Create and fit the visualizer visualizer = ConfusionMatrix(LogisticRegression()) visualizer.fit(X_train, y_train) # Score the visualizer s = visualizer.score(X_test, y_test) assert 0 <= s <= 1
tmp = []
for each in train['Sex']:
    if each == 'female':
        tmp.append(1)
    elif each == 'male':
        tmp.append(0)
    else:
        tmp.append(np.nan)
train['Sex'] = tmp

y = train.iloc[:, [0]].values
x = train.iloc[:, [1, 3, 4, 5, 6, 8]].values

x_train, x_test, y_train, y_test = tts(x, y, train_size=0.8, random_state=66)
# 1. Build the dataset

# 2) Model
model = XGBClassifier(n_estimators=1000)

# 3) Training
model.fit(x_train, y_train, eval_metric="rmse")

# model.feature_importances_
thresholds = np.sort(model.feature_importances_)
score_acc = model.score(x_test, y_test)

max = -1
# visualize how they look
num_classes = len(np.unique(y))

## plot each class
for ind, val in enumerate(np.unique(y)):
    plt.scatter(x[y == val, 0], x[y == val, 1],
                marker=marker_list_all[ind], c=color_list_all[ind],
                label='Class ' + str(val))
plt.legend(loc=0)
plt.xlim(x[:, 0].min(), x[:, 0].max())
plt.ylim(x[:, 1].min(), x[:, 1].max())
plt.tight_layout()
pic1 = 'scatter-show.pdf'
plt.savefig(pic1)
plt.show()

# separating data set
xtr, xte, ytr, yte = tts(x, y, test_size=0.3)

# standardizing the data
sc0 = SC()
sc0.fit(xtr)
xtr_std = sc0.transform(xtr)
xte_std = sc0.transform(xte)

# The following is for classifying
dtc = DTC()
dtc.fit(xtr_std, ytr)
ypd = dtc.predict(xte_std)
print("accuracy: ", dtc.score(xte_std, yte))
pdb(x, y, classifier=dtc, standardizer=sc0)
# # ('bayes', MultinomialNB())
# ])
# model.fit(docs, labels)
# model.predict(gensim_docs)

normal = TextNormalizer()
norm_docs = list(normal.fit_transform(docs))

# documents = norm_docs
# id2word = gensim.corpora.Dictionary(documents)
# taggeddoc = [ TaggedDocument(words, ['d{}'.format(idx)]) for idx, words in enumerate(documents) ]
# model = Doc2Vec(taggeddoc, vector_size=5, window=2, min_count=1, workers=4)
# docvecs = model.docvecs.vectors_docs

gensim = GensimTfidfVectorizer(type='tfidf')
gensim_docs = gensim.fit_transform(norm_docs)

X_train, X_test, y_train, y_test = tts(gensim_docs, y, test_size=0.2)

clf = LogisticRegressionCV()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# sklearn metrics expect the true labels first, then the predictions
print(f1_score(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

from gensim.models import Word2Vec
from gensim.models.phrases import Phraser, Phrases

common_terms = ["of", "with", "without", "and", "or", "the", "a"]
# Create the relevant phrases from the list of sentences:
phrases = Phrases(norm_docs, common_terms=common_terms)
bigram = Phraser(phrases)
np.save('./data/dacon/comp3/train_target.npy', arr=y)

# 1-5. Slice off the id column
x = train[:, 1:]
x_pred = test[:, 1:]

# 1-6. Scale, then reshape
scaler = StandardScaler()
x = scaler.fit_transform(x)
x_pred = scaler.transform(x_pred)  # reuse the scaler fitted on the training data

x = x.reshape(2800, 375, 4, 1)
x_pred = x_pred.reshape(700, 375, 4, 1)

# 1-7. train_test_split
x_train, x_test, y_train, y_test = tts(x, y, random_state=88, test_size=0.2)

# 2. Build the model
input1 = Input(shape=(375, 4, 1))
dense1 = Conv2D(175, (3, 3), padding='same')(input1)
dense1 = Conv2D(75, (2, 2), padding='same')(dense1)
dense1 = MaxPooling2D(pool_size=2, padding='same')(dense1)
dense1 = Dropout(0.2)(dense1)
dense1 = Conv2D(75, (2, 2), padding='same')(dense1)
dense1 = Conv2D(15, (2, 2), padding='same')(dense1)
dense1 = Flatten()(dense1)
output1 = Dense(4)(dense1)
model = Model(inputs=input1, outputs=output1)

# 3. Compile and train
import numpy as np

dataset = load_boston()
# print(type(load_boston()))  # <class 'sklearn.utils.Bunch'>
# print(dataset.keys())

x = dataset.data
y = dataset.target
print(type(x))  # <class 'numpy.ndarray'>
# print(x.shape)

from sklearn.model_selection import train_test_split as tts
x_train, x_test, y_train, y_test = tts(x, y, train_size=0.8)

n_estimators = 100
learning_rate = 0.01
colsample_bytree = 0.9   # winning models used 0.6~0.9
colsample_bylevel = 0.9  # winning models used 0.6~0.9
max_depth = 5
n_jobs = -1

parameters = {
    "n_estimators": np.arange(100, 301, 100),
    "learning_rate": np.arange(0.01, 0.03, 0.01),
    "colsample_bytree": np.arange(0.6, 1, 0.1),
    "colsample_bylevel": np.arange(0.6, 1, 0.1),
    "max_depth": [4, 5, 6]
}
import numpy as np from flask import Flask, request, jsonify, render_template import pickle from sklearn.preprocessing import RobustScaler as RS import pandas as pd scaler = RS() n = 0 app = Flask(__name__) model = pickle.load(open('modelfin.pkl', 'rb')) data = pd.read_csv(r'C:\Users\Sarvesh\pop.csv') X = data.drop(['popularity', 'ratio', 'categoryId', 'Unnamed: 0'], axis=1) Y = data['popularity'] from sklearn.model_selection import train_test_split as tts X_train, X_test, Y_train, Y_test = tts(X, Y, test_size=0.20, shuffle=True) scaler.fit(X_train) @app.route('/') def home(): return render_template('yt.htm') @app.route('/predict', methods=['POST']) def predict(): features = [float(x) for x in request.form.values()] final_features = [np.array(features)] f = scaler.transform(final_features) prediction = model.predict(f) if (prediction[0] == 0): output = "High" if (prediction[0] == 2):
    weights=[0.3, 0.7],
    n_informative=3,
    n_redundant=1,
    flip_y=0,
    n_features=5,
    n_clusters_per_class=1,
    n_samples=5000,
    random_state=10,
)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)

# Create the samplers
enn = EditedNearestNeighbours()
renn = RepeatedEditedNearestNeighbours()

# Create the classifier
knn = KNN(1)

# Make the splits
X_train, X_test, y_train, y_test = tts(X, y, random_state=42)

# Add one transformer and two samplers in the pipeline object
pipeline = make_pipeline(pca, enn, renn, knn)
pipeline.fit(X_train, y_train)
y_hat = pipeline.predict(X_test)

print(classification_report(y_test, y_hat))
from sklearn.preprocessing import Imputer

# create an imputer that fills missing values with the column mean
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
# fit the imputer on our matrix y
imputer = imputer.fit(y[:, :])
# now apply the imputer to the matrix so that the missing values are filled in
# with the strategy we picked
# fit()            learns the statistics from the data
# transform()      applies them to the data
# fit_transform()  does both together
y = imputer.transform(y)

#%%
# Splitting the dataset into training and test sets
'''
Training set - the data the model learns from
Test set     - the data against which the model checks itself
'''
from sklearn.model_selection import train_test_split as tts
# split features and targets together so x_train/x_test line up with y_train/y_test
x_train, x_test, y_train, y_test = tts(x, y, test_size=0.2, random_state=0)

#%%
# Feature scaling - rescales the entries so that all columns are on a
# comparable scale
from sklearn.preprocessing import StandardScaler as ss
sc_x = ss()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)
#datainterpretation data.info() print(data.describe()) #createarrays #x:all independent data #y:Outcome(depenedent data) x = data.iloc[:, :-1].values y = data.iloc[:, -1].values #splituniversaldataset(train:test) #library:sklearn #module:model_selection #classtrain_test_split from sklearn.model_selection import train_test_split as tts x_train, x_test, y_train, y_test = tts(x, y, test_size=0.3, random_state=3) #algorithmselection #linearregression #library:sklearn #module:linear_model #class:LinearRegression from sklearn.linear_model import LinearRegression as linreg model_linreg = linreg() #trainthemodel model_linreg.fit(x_train, y_train) #Testthemodel #predictingoutput y_pred = model_linreg.predict(x_test)
with open(email, encoding="iso8859_1") as f: words = f.read().split(' ') for entry in dictionary: data.append(words.count(entry[0])) feature_set.append(data) if "ham" in email: labels.append(0) if "spam" in email: labels.append(1) return feature_set, labels d = make_dict() features, labels = make_dataset(d) x_train, x_test, y_train, y_test = tts(features, labels, test_size=0.2) # 80% for data training #clasifier clf = MultinomialNB() clf.fit(x_train, y_train) preds = clf.predict(x_test) print(accuracy_score(y_test, preds)) save(clf, "text-classifier.mdl") while True: features = [] inp = input(">").split() if inp[0] == "exit": break for word in d: features.append(inp.count(word[0]))
def build_and_evaluate(balanced, X, y, classifier=LogisticRegression, outpath=None, verbose=True): def build(balanced, classifier, X, y=None): """ Inner build function that builds a single model. """ if isinstance(classifier, type): # classifier = classifier() if balanced == True: class_weight = 'balanced' # neg_count = 0 # neu_count = 0 # pos_count = 0 # for label in y: # if label == 0: # neg_count += 1 # elif label == 1: # neu_count += 1 # elif label == 2: # pos_count += 1 # # if(len(set(y))) == 3: # minimum = min(neg_count, neu_count, pos_count) # class_weight = {0: minimum/neg_count, 1: minimum/neu_count, 2: minimum/pos_count} # elif (len(set(y))) == 2: # pos_count = neu_count # minimum = min(neg_count, pos_count) # class_weight = {0: minimum/neg_count, 1: minimum/pos_count } # print('0:', neg_count, '1:', neu_count, '2:', pos_count) # print(class_weight) else: class_weight = None classifier = classifier(multi_class='multinomial', solver='saga', class_weight=class_weight) # classifier = classifier(max_iter=1000, class_weight = class_weight) # classifier = classifier(class_weight=class_weight, C=1) # classifier = classifier(class_weight = class_weight) model = Pipeline([ ('preprocessor', NLTKPreprocessor()), # ('vectorizer', CountVectorizer(tokenizer=identity,preprocessor=None,lowercase=None,ngram_range =(1,2))), ('vectorizer', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=None, ngram_range=(1, 2))), # ('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), threshold='median')), # ('feature_selection', SelectPercentile(percentile=50)), ('feature_selection', SelectPercentile(score_func=chi2, percentile=90)), # ('to_dense', DenseTransformer()), # ('standardization', StandardScaler(with_mean=False)), # ('feature_selection', VarianceThreshold(threshold=(.8 * (1 - .8)))), ('classifier', classifier), ]) # parameters = { # # 'vectorizer__max_features': [85000,100000,125000,150000] # # 'vectorizer__max_df': [0.5,0.6,0.7,0.8] # # 'classifier__loss': ['log', 'modified_huber', 'squared_hinge', 'perceptron'] # 'classifier__multi_class': ['multinomial', 'ovr'], # 'classifier__solver': ['newton-cg', 'sag', 'saga', 'lbfgs'] # } # grid = GridSearchCV(model,param_grid=parameters) # grid.fit(X,y) # # print("Best: %f using %s" % (grid.best_score_, # grid.best_params_)) # means = grid.cv_results_['mean_test_score'] # stds = grid.cv_results_['std_test_score'] # params = grid.cv_results_['params'] # for mean, stdev, param in zip(means, stds, params): # print("%f (%f) with: %r" % (mean, stdev, param)) # return grid model.fit(X, y) return model # Label encode the targets labels = LabelEncoder() y = labels.fit_transform(y) # Begin evaluation if verbose: print("Building for evaluation") X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=0) model = build(balanced, classifier, X_train, y_train) y_pred = model.predict(X_test) y_actual = pd.Series(y_test, name='Actual') y_predicted = pd.Series(y_pred, name='Predicted') df_confusion = pd.crosstab(y_actual, y_predicted, rownames=['Actual'], colnames=['Predicted'], margins=True) if verbose: print("Confusion Matrix:\n") print(df_confusion) if verbose: print("Classification Report:\n") print(clsr(y_test, y_pred, target_names=labels.classes_, digits=4)) print( accuracy_score(y_test, y_pred, normalize=True, sample_weight=None) * 100) # seed = 7 # kfold = StratifiedKFold(n_splits=5) # scores = cross_val_score(model, X_train, y_train, cv=kfold) # print(scores) # print("Accuracy: %0.2f (+/- 
%0.2f)" % (scores.mean(), scores.std() * 2)) if verbose: print("Building complete model and saving ...") model = build(balanced, classifier, X, y) model.labels_ = labels if outpath: with open(outpath, 'wb') as f: pickle.dump(model, f) print("Model written out to {}".format(outpath)) return model
def train_test_split(features, labels, random_state):
    global features_train, features_test, labels_train, labels_test
    # pass random_state as a keyword argument so it is not treated as a third
    # array to be split
    features_train, features_test, labels_train, labels_test = tts(
        features, labels, random_state=random_state)
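A short, illustrative usage of the wrapper above, assuming tts is sklearn's train_test_split imported at module level; the toy arrays are hypothetical.

# Illustrative usage of the wrapper above; assumes `tts` is sklearn's
# train_test_split imported at module level. After the call, the splits are
# available through the module-level globals it sets.
features = [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]]
labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

train_test_split(features, labels, random_state=42)
print(len(features_train), len(features_test))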
from sklearn import linear_model
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts

# In[2]:
data = np.genfromtxt('linear.csv', delimiter=',')
x_data = data[1:, 0, np.newaxis]
y_data = data[1:, 1, np.newaxis]
plt.scatter(x_data, y_data)
plt.show()

# In[3]:
x_train, x_test, y_train, y_test = tts(x_data, y_data, test_size=0.2)

# In[4]:
model = linear_model.LinearRegression()
model.fit(x_train, y_train)

# In[5]:
plt.scatter(x_data, y_data)
plt.plot(x_data, model.predict(x_data), c='r')
plt.show()

# In[6]:
model.score(x_test, y_test)
def split_dataset(df): X = df.drop('G3', axis=1) y = df['G3'] x_train, x_test, y_train, y_test = tts(X, y, test_size=0.2) return x_train, x_test, y_train, y_test
def _make_dataset(X, y, split=False): if split: X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2) return Dataset(Split(X_train, X_test), Split(y_train, y_test)) return Dataset(X, y)
# In[73]:
X = df[['OverallQual', 'TotalBsmtSF', 'GrLivArea', 'GarageArea', '1stFlrSF']]

# In[74]:
y = df['SalePrice']

# In[75]:
from sklearn.model_selection import train_test_split as tts

# In[76]:
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3, random_state=42)

# In[77]:
from sklearn.linear_model import LinearRegression

# In[78]:
reg = LinearRegression()

# In[79]:
reg.fit(X_train, y_train)
# predict only after the regressor has been fit
y_pred = reg.predict(X_test)

# In[80]:
def __buildTrainingAndEvalDatasets(self): ################################################################################### # This method builds the train and eval datasets from the given pickled data file. ################################################################################### global HEADER_COLS, EMPTY_BUSINESS_ITEM_MSG try: # Check if the train file exists if os.path.exists(self.trainFile) is False: log.error(f"Pickle file '{self.trainFile}' does not exist!") return False # Check if the file has been read successfully dfPckl = pandas.read_pickle(self.trainFile) if not dfPckl is None: totalInitRows = len(dfPckl) log.debug(dfPckl[HEADER_COLS].head(50)) time.sleep(60) # Remove rows corresponding to the sectors with the following labels: # "cosmetics & fragrance inc", "herman inc", "ruger & company", "inc", "inc.", # "inc (formerly acxiom)", "inc. (staten island", "incorporated", "ltd." and "na". log.debug(f"Removing rows where '{HEADER_COLS[1]} != {', '.join(SECTOR_LABELS)}'") dfPckl = dfPckl[dfPckl.eval(HEADER_COLS[1]).isin(SECTOR_LABELS)] log.debug(dfPckl[HEADER_COLS].head(50)) time.sleep(60) # Remove rows corresponding to the value: "No business text found" in the column "text". log.debug(f"Removing rows where '{HEADER_COLS[0]} == {EMPTY_BUSINESS_ITEM_MSG}'..") dfPckl = dfPckl[dfPckl.eval(HEADER_COLS[0]).str.lower() != EMPTY_BUSINESS_ITEM_MSG.lower()] log.debug(dfPckl[HEADER_COLS].head(50)) time.sleep(60) log.debug(f"Total records in the dataframe: {totalInitRows}.") log.debug(f"Total records in the dataframe that were removed: {totalInitRows - len(dfPckl)}.") log.debug(f"Total records in the dataframe for training and evaluation: {len(dfPckl)}.") # Apply pre-processing on the "text" column on multiple processors log.debug(f"Applying pre-processing on the '{HEADER_COLS[0]}' column..") with mproc.Pool(NUM_CPUs) as p: dfPckl[HEADER_COLS[0]] = p.map(preprocessSequenceWithoutBreakingSentence, [text for text in dfPckl[HEADER_COLS[0]]]) log.debug(dfPckl[HEADER_COLS].head(50)) time.sleep(60) # Save the pre-processed dataframe to a pickle file try: preProcPckl = os.path.join(os.path.split(self.trainFile)[0], os.path.split(self.trainFile)[1].split(".")[0] + ".preproc.pkl") dfPckl.to_pickle(preProcPckl) log.info(f"Successfully saved the pre-processed training file to '{preProcPckl}'.") except: log.error("Error saving the pre-processed dataset to file.") # Split into train and eval datasets self.trainDataset, self.evalDataset = tts(dfPckl, test_size=0.33, shuffle=True, random_state=42) if not self.trainDataset is None and not self.evalDataset is None: log.debug(f"Total records in the dataframe for training are '{len(self.trainDataset)}'.") log.debug(f"Total records in the dataframe for evaluation are '{len(self.evalDataset)}'.") log.info(f"Successfully generated train and eval datasets.") return True else: log.error(f"Error generating train and eval datasets. Cannot continue with finetuning.") return False else: log.error(f"Error reading the pickle file '{self.trainFile}'. Cannot continue with finetuning.") return False except: exc_type, exc_value, exc_traceback = sys.exc_info() err = f"** ERROR ** Error occurred while building training and eval datasets from the pickled file '{self.trainFile}'. Error is: {str(exc_type)}; {str(exc_value)}." raise Exception(err)
import numpy as np import pandas as pd from sklearn.linear_model import LogisticRegression from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split as tts import pickle dataIris = load_iris() df = pd.DataFrame(dataIris["data"], columns=["SL", "SW", "PL", "PW"]) df["target"] = dataIris["target"] df["species"] = df["target"].apply(lambda x: dataIris["target_names"][x]) xtr, xts, ytr, yts = tts(df[["SL", "SW", "PL", "PW"]], df["species"], test_size=.1) model = LogisticRegression(solver="lbfgs", multi_class="auto", max_iter=100000) model.fit(xtr, ytr) with open("modelPickle", "wb") as modPkl: pickle.dump(model, modPkl)
os.chdir("D:/GreyAtom/Datasets") df = pd.read_csv("wbc.csv") # In[22]: X = df.drop(["diagnosis", "Unnamed: 32"], axis = 1) y = df["diagnosis"] # In[23]: X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3, stratify=y, random_state=1) # In[24]: dtc = DecisionTreeClassifier(max_depth=4, min_samples_leaf=0.16, random_state=1) # In[25]: bc = BaggingClassifier(base_estimator=dtc, n_estimators=300, n_jobs=1) bc.fit(X_train, y_train)
text = ' '.join(words) return text print('Cleaning of the data taking place....') data['Text'] = data['Text'].map(cleaning) data['Score'] = data['Score'].replace({'positive':0,'negative':1}) x = data['Text'].values y = data['Score'].values # splitting the data xtrain, xtest, ytrain, ytest = tts(x,y,test_size=0.2,stratify=y) # converting it to categorical variable ytrain = to_categorical(ytrain) ytest = to_categorical(ytest) # converting to text to sequences tokenizer = Tokenizer(25000,lower=True,oov_token='UNK') tokenizer.fit_on_texts(xtrain) xtrain = tokenizer.texts_to_sequences(xtrain) xtest = tokenizer.texts_to_sequences(xtest) xtrain = pad_sequences(xtrain,maxlen=100,padding='post') xtest = pad_sequences(xtest,maxlen=100,padding='post') print("Data preprocessing is over....")
def ensemble_tsfresh(forecast_in, forecast_out, season, perd): #### Create rolled time series for ts feature extraction def tsfresh_run(forecast, season, insample=True, forecast_out=None): df_roll_prep = forecast.reset_index() if insample: df_roll_prep = df_roll_prep.drop(["Target", "Date"], axis=1) df_roll_prep["id"] = 1 target = forecast["Target"] else: df_roll_prep = df_roll_prep.drop(["index"], axis=1) df_roll_prep["id"] = 1 df_roll = roll_time_series(df_roll_prep, column_id="id", column_sort=None, column_kind=None, rolling_direction=1, max_timeshift=season - 1) counts = df_roll['id'].value_counts() df_roll_cut = df_roll[df_roll['id'].isin( counts[counts >= season].index)] ### TS feature extraction concat_df = pd.DataFrame() #rap = 4 ## Change this to suit your memory capacity, the lower the more memory concat_df = extract_features(df_roll_cut.ffill(), column_id="id", column_sort="sort", n_jobs=season, show_warnings=False, disable_progressbar=True) if insample: concat_df = concat_df.dropna(axis=1, how="all") concat_df.index = target[df_roll_cut['id'].value_counts(). index].sort_index().to_frame().index concat_df = pd.merge(target[df_roll_cut['id'].value_counts(). index].sort_index().to_frame(), concat_df, left_index=True, right_index=True, how="left") concat_df_list = constant_feature_detect(data=concat_df, threshold=0.95) concat_df = concat_df.drop(concat_df_list, axis=1) else: forecast_out.index.name = "Date" concat_df.index = forecast_out.index concat_df = impute(concat_df) return concat_df print("LightGBM ensemble have been successfully built") concat_df_drop_in = tsfresh_run(forecast_in, season, insample=True) extracted_n_selected = select_features( concat_df_drop_in.drop("Target", axis=1), concat_df_drop_in["Target"], fdr_level=0.01, n_jobs=12) # fdr is the significance level. 
forecast_out_add = pd.concat((forecast_in.iloc[-season + 1:, :].drop( ["Target"], axis=1), forecast_out), axis=0) concat_df_drop_out = tsfresh_run(forecast_out_add, season, insample=False, forecast_out=forecast_out) extracted_n_selected_out = concat_df_drop_out[extracted_n_selected.columns] ## Reduce the dimensions of generated time series features pca2 = PCA(n_components=8) pca2.fit(extracted_n_selected) pca2_results_in = pca2.transform(extracted_n_selected) pca2_results_out = pca2.transform(extracted_n_selected_out) cols = 0 for i in range(pca2_results_in.shape[1]): cols = cols + 1 extracted_n_selected["pca_" + str(i)] = pca2_results_in[:, i] extracted_n_selected_out["pca_" + str(i)] = pca2_results_out[:, i] df = forecast_in.iloc[season - 1:, :].copy() df = time_feature(df, perd) df["mean"] = df.drop(["Target"], axis=1).mean(axis=1) df_new = pd.concat( (df.reset_index(), extracted_n_selected.iloc[:, -cols:].reset_index(drop=True)), axis=1) df_new = df_new.set_index("Date") forecast_train, forecast_test = tts(df_new, train_size=0.5, shuffle=False, stratify=None) target = "Target" d_train = lgb.Dataset(forecast_train.drop(columns=[target]), label=forecast_train[target]) #d_valid = lgb.Dataset(forecast_test.drop(columns=[target]), label=forecast_test[target]) params = { 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'rmsle', 'max_depth': 6, 'learning_rate': 0.1, 'verbose': 0, 'num_threads': 16 } model = lgb.train(params, d_train, 100, verbose_eval=1) ensemble_ts = pd.DataFrame(index=forecast_test.index) ensemble_ts["ensemble_ts"] = model.predict( forecast_test.drop(columns=[target])) df_out = forecast_out.copy() df_out = time_feature(df_out, perd) df_out["mean"] = df_out.mean(axis=1) ensemble_ts_out = pd.DataFrame(index=df_out.index) ensemble_ts_out["ensemble_ts"] = model.predict(df_out) print("LightGBM ensemble have been successfully built") return ensemble_ts, ensemble_ts_out
SVMAccuracy = 0
accuracy_score = 0
logAccuracy = 0
gnbAccuracy = 0
ncAccuracy = 0
myAccuracy = 0
iterations = 5

print(" Iterating cross validation : ", end="")
for i in range(iterations):
    print(i)
    X_train, X_test, y_train, y_test = tts(
        data1, trainlabels, test_size=0.3)
    newRows = len(X_train)
    newCols = len(X_train[0])
    newRowst = len(X_test)
    newColst = len(X_test[0])
    newRowsL = len(y_train)

    PearFeatures = PearsonCorrtin(X_train, y_train, features)
    allFeatures.append(PearFeatures)
    argument = copy.deepcopy(PearFeatures)
    data_fea = dataCreation(argument, X_train)
    RepeatedEditedNearestNeighbours)

print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=1.25, weights=[0.3, 0.7],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=5, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)

# Create the samplers
enn = EditedNearestNeighbours()
renn = RepeatedEditedNearestNeighbours()

# Create the classifier
knn = KNN(1)

# Make the splits
X_train, X_test, y_train, y_test = tts(X, y, random_state=42)

# Add one transformer and two samplers in the pipeline object
pipeline = make_pipeline(pca, enn, renn, knn)
pipeline.fit(X_train, y_train)
y_hat = pipeline.predict(X_test)

print(classification_report(y_test, y_hat))
@author: KARIS
"""

# import data
import pandas as pd
df = pd.read_csv('Bahubali2_vs_Dangal.csv')

# splitting dependent and independent values
features = df.iloc[:, :1].values
lab_bahu = df.iloc[:, 1:2].values
lab_dang = df.iloc[:, 2:3].values

# splitting test and train
from sklearn.model_selection import train_test_split as tts
features_train, features_test, lab_bahu_train, lab_bahu_test, lab_dang_train, lab_dang_test = tts(
    features, lab_bahu, lab_dang, test_size=0.2, random_state=0)

# fitting data on Bahubali's train set
from sklearn.linear_model import LinearRegression
reg_bahu = LinearRegression()
reg_bahu.fit(features_train, lab_bahu_train)

# fitting data on Dangal's train set
reg_dang = LinearRegression()
reg_dang.fit(features_train, lab_dang_train)

# predicting income on the 10th day (predict expects a 2D array)
lab_pred_bahu = reg_bahu.predict([[10]])
lab_pred_dang = reg_dang.predict([[10]])
# data set separation
from sklearn.model_selection import train_test_split as tts
from decisionboundary import plot_decision_boundary as pdb
from decisionboundary import marker_list_all, color_list_all

xxor = np.random.randn(350, 2)
yxor = np.logical_xor(xxor[:, 0] > 0, xxor[:, 1] > 0)

# visualize how they look
## class False
plt.scatter(xxor[yxor == False, 0], xxor[yxor == False, 1],
            marker=marker_list_all[0], c=color_list_all[0],
            label='Class False')
## class True
plt.scatter(xxor[yxor == True, 0], xxor[yxor == True, 1],
            marker=marker_list_all[1], c=color_list_all[1],
            label='Class True')
plt.legend(loc=0)
plt.xlim(xxor[:, 0].min(), xxor[:, 0].max())
plt.ylim(xxor[:, 1].min(), xxor[:, 1].max())
plt.tight_layout()
pic1 = 'scatter-show.pdf'
plt.savefig(pic1)
plt.show()

# separating data set
xtr, xte, ytr, yte = tts(xxor, yxor, test_size=0.3)

# The following is for classifying
svc0 = SVC(C=100.0, kernel='rbf')
svc0.fit(xtr, ytr)
ypd = svc0.predict(xte)
print("accuracy: ", svc0.score(xte, yte))
pdb(xxor, yxor, classifier=svc0)
## plot each class
for ind, val in enumerate(np.unique(y)):
    plt.scatter(x[y == val, 0], x[y == val, 1],
                marker=marker_list_all[ind], c=color_list_all2[ind],
                label='Class ' + str(val))
plt.legend(loc=0)
plt.xlim(x[:, 0].min(), x[:, 0].max())
plt.ylim(x[:, 1].min(), x[:, 1].max())
plt.tight_layout()
pic1 = 'random-forest-scat.pdf'
plt.savefig(pic1)
plt.show()

# separating data set
xtr, xte, ytr, yte = tts(x, y, test_size=0.3, random_state=0)

# standardizing the data
sc0 = SC()
sc0.fit(xtr)
xtr_std = sc0.transform(xtr)
xte_std = sc0.transform(xte)

# The following is for classifying
rfc = RFC(criterion="entropy", n_estimators=50, random_state=1)
rfc.fit(xtr_std, ytr)
ypd = rfc.predict(xte_std)
print("accuracy: ", rfc.score(xte_std, yte))
pdb(x, y, classifier=rfc, standardizer=sc0)
d = match_data.drop('winner', axis=1).values for i in d: home = i[1] away = i[0] great_match_arrays.append(np.hstack([i, team_standings.loc[(team_standings['team.ID'] == home)].values.flatten(), team_standings.loc[(team_standings['team.ID'] == away)].values.flatten()])) final_data = np.stack(great_match_arrays) print(final_data.shape) target = match_data['winner'].values pd.DataFrame(target).to_csv('target.csv') target = match_data['winner'].values X_train, X_test, y_train, y_test = tts(final_data, target, train_size=0.75, test_size=0.25, random_state=42) print(X_train.shape, X_test.shape, y_train.shape, y_test.shape) # NOTE: Make sure that the class is labeled 'target' in the data file # Score on the training set was:0.6928853754940711 exported_pipeline = make_pipeline( StackingEstimator(estimator=RandomForestClassifier(bootstrap=True, criterion="gini", max_features=0.5, min_samples_leaf=20, min_samples_split=5, n_estimators=100)), DecisionTreeClassifier(criterion="gini", max_depth=6, min_samples_leaf=16, min_samples_split=15) ) exported_pipeline.fit(X_train, y_train) results = exported_pipeline.predict(final_data) def predict_game(away, home):
def load_data(filepath): data = pd.read_csv(filepath) labels = data.iloc[:, 0] data = data.iloc[:, 1:] cat_cols_idx = sorted([data.columns.get_loc(c) for c in categorical_cols]) d_train, d_test, y_train, y_test = tts(data, labels, test_size=0.3, random_state=42) test_idx = list(d_test.index.values) print('Generating oversampled datasets...') # SMOTE-NC Before d_train_b, y_train_b = SMOTENC( categorical_features=cat_cols_idx, k_neighbors=5, random_state=42).fit_resample(data, labels) d_train_b = np.delete(d_train_b, test_idx, axis=0) y_train_b = np.delete(y_train_b, test_idx, axis=0) d_test_b = deepcopy(d_test) # SMOTE-NC After d_train_a, y_train_a = SMOTENC( categorical_features=cat_cols_idx, k_neighbors=5, random_state=42).fit_resample(d_train, y_train) d_test_a = deepcopy(d_test) # Scale numeric features only print('Scaling numeric features...') scaler = StandardScaler() for i in range(d_train.shape[1]): col = data.columns[i] if col in categorical_cols: continue # Original d_train[[col]] = scaler.fit_transform(d_train[[col]]) d_test[[col]] = scaler.transform(d_test[[col]]) # SMOTE-NC Before d_train_b[:, i] = np.ravel(scaler.fit_transform( d_train_b[:, i].reshape(-1, 1))) d_test_b[[col]] = scaler.transform(d_test_b[[col]]) # SMOTE-NC After d_train_a[:, i] = np.ravel(scaler.fit_transform( d_train_a[:, i].reshape(-1, 1))) d_test_a[[col]] = scaler.transform(d_test_a[[col]]) # Original train_ldr = td.DataLoader(utils.ClockDrawingDataset(d_train, y_train), batch_size=10, shuffle=True, num_workers=0) test_ldr = td.DataLoader(utils.ClockDrawingDataset(d_test, y_test), batch_size=10, shuffle=False, num_workers=0) # SMOTE-NC Before train_ldr_b = td.DataLoader(utils.ClockDrawingDataset(d_train_b, y_train_b), batch_size=10, shuffle=True, num_workers=0) test_ldr_b = td.DataLoader(utils.ClockDrawingDataset(d_test_b, y_test), batch_size=10, shuffle=False, num_workers=0) # SMOTE-NC After train_ldr_a = td.DataLoader(utils.ClockDrawingDataset(d_train_a, y_train_a), batch_size=10, shuffle=True, num_workers=0) test_ldr_a = td.DataLoader(utils.ClockDrawingDataset(d_test_a, y_test), batch_size=10, shuffle=False, num_workers=0) return [train_ldr, train_ldr_b, train_ldr_a], \ [test_ldr, test_ldr_b, test_ldr_a]
from sklearn.datasets import load_digits, load_iris from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split as tts from yellowbrick.classifier import ConfusionMatrix if __name__ == '__main__': digits = load_digits() digit_X = digits.data digit_y = digits.target d_X_train, d_X_test, d_y_train, d_y_test = tts( digit_X, digit_y, test_size=0.2 ) model = LogisticRegression() digit_cm = ConfusionMatrix(model, classes=[0,1,2,3,4,5,6,7,8,9]) digit_cm.fit(d_X_train, d_y_train) digit_cm.score(d_X_test, d_y_test) d = digit_cm.poof(outpath="images/confusion_matrix_digits.png") iris = load_iris() iris_X = iris.data iris_y = iris.target iris_classes = iris.target_names i_X_train, i_X_test, i_y_train, i_y_test = tts( iris_X, iris_y, test_size=0.2 ) model = LogisticRegression() iris_cm = ConfusionMatrix( model, classes=iris_classes,
def build_and_evaluateSVM(X, y, n=None, classifier=svm.SVC, outpath=None, verbose=True): """ Builds a classifer for the given list of documents and targets in two stages: the first does a train/test split and prints a classifier report, the second rebuilds the model on the entire corpus and returns it for operationalization. X: a list or iterable of raw strings, each representing a document. y: a list or iterable of labels, which will be label encoded. Can specify the classifier to build with: if a class is specified then this will build the model with the Scikit-Learn defaults, if an instance is given, then it will be used directly in the build pipeline. If outpath is given, this function will write the model as a pickle. If verbose, this function will print out information to the command line. """ @timeit def build(classifier, X, y=None): """ Inner build function that builds a single model. """ if isinstance(classifier, type): classifier = classifier(kernel='rbf') gridsearch_pipe = Pipeline([ # ('preprocessor', TextNormalizer_lemmatize()), ('vectorizer', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False, ngram_range=(1, 2))), ('classifier', classifier), ]) # maxdf = [0.85, 0.90, 0.95] # mindf = (4, 3, 2) # nfeat = [12000, 12500, 13000] # ngrams = [(1, 1), (1, 2), (1,3)] # # Cs = [0.001, 0.01, 0.1, 1, 10] # # gammas = [0.001, 0.01, 0.1, 1] # param_grid = { # # 'classifier__C': Cs, 'classifier__gamma' : gammas, # 'vectorizer__max_df':maxdf, 'vectorizer__min_df':mindf, 'vectorizer__ngram_range':ngrams, 'vectorizer__max_features':nfeat # } # grid_search = GridSearchCV(gridsearch_pipe, param_grid, cv=10) # grid_search.fit(X, y) # best_param = grid_search.best_params_ # print(best_param) # vectorizer = TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False, # max_df=best_param['vectorizer__max_df'], min_df=best_param['vectorizer__min_df'], # ngram_range=best_param['vectorizer__ngram_range'], max_features=best_param['vectorizer__max_features']) # classifier = svm.SVC(kernel='rbf', C=best_param['classifier__C'], gamma=best_param['classifier__gamma']) vectorizer = TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False, ngram_range=(1, 2), max_features=12000, max_df=0.85, min_df=4) classifier = svm.SVC(kernel='rbf', C=10, gamma=1) model = Pipeline([ # ('preprocessor', TextNormalizer_lemmatize()), ('vectorizer', vectorizer), ('classifier', classifier), ]) model.fit(X, y) return model # Label encode the targets labels = LabelEncoder() y = labels.fit_transform(y) # Begin evaluation if n: if verbose: print("splitting test and test set by: " + str(n)) n_samples = len(y) indicies = np.arange(n_samples) X_train, X_test, y_train, y_test, idx_train, idx_test = tts( X, y, indicies, test_size=n, stratify=y) # X_train, X_test, y_train, y_test = X[:n], X[n:], y[:n], y[n:] print(len(X_train), len(X_test)) from collections import Counter print(Counter(y_train)) model, secs = build(classifier, X_train, y_train) model.labels_ = labels if verbose: print("Evaluation model fit in {:0.3f} seconds".format(secs)) y_pred = model.predict(X_test) if verbose: print("Classification Report:\n") print(clsr(y_test, y_pred, target_names=labels.classes_)) print(cm(y_test, y_pred)) print('acc', accuracy_score(y_test, y_pred)) print('f1', f1_score(y_test, y_pred, average='weighted')) else: if verbose: print("Building for evaluation with full set") model, secs = build(classifier, X, y) model.labels_ = labels if verbose: print("Evaluation model fit in {:0.3f} seconds".format(secs)) 
y_pred = model.predict(X) if verbose: print("Classification Report:\n") print(clsr(y, y_pred, target_names=labels.classes_)) print(cm(y, y_pred)) print(accuracy_score(y, y_pred)) if verbose: print("Evaluation of naive prediction ...") y_naive = [0] * len(y_test) print(type(y_test)) print('acc naive', accuracy_score(y_test, y_naive)) if verbose: print("Complete model fit in {:0.3f} seconds".format(secs)) if outpath: with open(outpath, 'wb') as f: pickle.dump(model, f) print("Model written out to {}".format(outpath)) return model, y_pred, idx_test
tfv = TfidfVectorizer(min_df=1, stop_words='english') data = pd.read_csv('BankFAQs.csv') questions = data['Question'].values X = [] for question in questions: X.append(cleanup(question)) tfv.fit(X) le.fit(data['Class']) X = tfv.transform(X) y = le.transform(data['Class']) trainx, testx, trainy, testy = tts(X, y, test_size=.25, random_state=42) model = SVC(kernel='linear') model.fit(trainx, trainy) print("SVC:", model.score(testx, testy)) def get_max5(arr): ixarr = [] for ix, el in enumerate(arr): ixarr.append((el, ix)) ixarr.sort() ixs = [] for i in ixarr[-5:]: ixs.append(i[1])
# Merge the data
if i_month == para.month_in_sample[0]:  # the first month
    data_in_sample = data_curr_month  # define the in-sample data frame and start filling it from the first month
else:
    data_in_sample = data_in_sample.append(data_curr_month)  # for every month after the first, append its data to the frame

#%% Data preprocessing
# Split the in-sample set into a training set and a cross-validation set, then
# use PCA to reduce dimensionality and remove factor collinearity.

# Take the sample space
X_in_sample = data_in_sample.loc[:, 'EP':'bias']   # slice: all rows, all 70 factor columns (## what about duplicate column names?)
Y_in_sample = data_in_sample.loc[:, 'return_bin']  # slice: all rows, the label column

# Randomly split the sample space into training and cross-validation sets
X_train, X_cv, Y_train, y_cv = tts(X_in_sample, Y_in_sample, test_size=para.percent_cv, random_state=para.seed)

# PCA
pca = decomposition.PCA(n_components=0.95)  # a float in (0, 1) keeps enough components to explain that fraction of variance; an integer > 1 keeps that many leading components
pca.fit(X_train)                  # fit PCA on the training set
X_train = pca.transform(X_train)  # transform the training set with the fitted PCA model
X_cv = pca.transform(X_cv)        # transform the cross-validation set with the fitted PCA model

# Standardize the data
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_cv = scaler.transform(X_cv)

#%% Core model setup
if para.method == 'SVM':
from build import models, reader from build import labels as categories from sklearn.model_selection import train_test_split as tts from sklearn.metrics import classification_report docs = reader.fileids(categories=categories) labels = [reader.categories(fileids=[fid])[0] for fid in docs] train_docs, test_docs, train_labels, test_labels = tts(docs, labels, test_size=0.2) def get_docs(fids): for fid in fids: yield list(reader.docs(fileids=[fid])) sgd = models[3] nby = models[4] sgd.fit(get_docs(train_docs), train_labels) y_pred = sgd.predict(get_docs(test_docs)) print(classification_report(test_labels, y_pred, labels=categories)) import nltk def preprocess(text): return [ [ list(nltk.pos_tag(nltk.word_tokenize(sent))) for sent in nltk.sent_tokenize(para)
n_users = len(ratings_df_sample['userId'].unique()) n_movies = len(ratings_df_sample['movieId'].unique()) (n_users, n_movies) movie_ids = ratings_df_sample['movieId'].unique() def scale_movie_id(movie_id): scaled = np.where(movie_ids == movie_id)[0][0] + 1 return scaled ratings_df_sample['movieId'] = ratings_df_sample['movieId'].apply(scale_movie_id) ratings_df_sample.head() train_data, test_data = tts(ratings_df_sample, test_size=0.2) print('Train shape: {}'.format(train_data.shape)) print('Test shape: {}'.format(test_data.shape)) def rmse(prediction, ground_truth): prediction = np.nan_to_num(prediction)[ground_truth.nonzero()].flatten() ground_truth = np.nan_to_num(ground_truth)[ground_truth.nonzero()].flatten() mse = mean_squared_error(prediction, ground_truth) return sqrt(mse) train_data_matrix = np.zeros((n_users, n_movies)) for line in train_data.itertuples(): train_data_matrix[line[1] - 1, line[2] - 1] = line[3]
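A small sketch of how the rmse helper above is typically applied, assuming a test matrix built the same way as train_data_matrix; the stand-in prediction matrix is illustrative only.

# Illustrative only: build a test matrix the same way as train_data_matrix and
# score a prediction matrix of the same shape against it with the rmse helper.
test_data_matrix = np.zeros((n_users, n_movies))
for line in test_data.itertuples():
    test_data_matrix[line[1] - 1, line[2] - 1] = line[3]

# `prediction` would normally come from a recommender (e.g. user-based CF);
# here the training matrix is reused as a stand-in just to show the call.
prediction = train_data_matrix
print('RMSE: {:.4f}'.format(rmse(prediction, test_data_matrix)))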
        for entry in dic:
            data.append(palvrs.count(entry[0]))
        feature_set.append(data)
        if "ham" in email:
            labels.append(0)
        if "spam" in email:
            labels.append(1)
        c = c - 1
    return feature_set, labels

d = construir_dicionario()
features, labels = criar_dataset(d)

print(" Splitting our dataset into train and test, 80% train, 20% test,")
print(" using the train and test features and labels...\n")
x_train, x_test, y_train, y_test = tts(features, labels, test_size=0.2)

print(" Creating the Machine Learning model based on the training dataset...\n")
clf = MultinomialNB()
clf.fit(x_train, y_train)

print(" Making predictions to test the model we just created...\n")
pred = clf.predict(x_test)
print(" Accuracy score achieved by our model: ", end='')
print(accuracy_score(y_test, pred))

print("\n Saving the model to a file for later use.\n")
salvar_modelo(clf, "text-classifier.mdl")
#print("this is labels \n",labels) return feature_set, labels # In[5]: d = make_dict() features, labels = make_dataset(d) # In[6]: #import model_selection to split the datasets into train and test sets from sklearn.model_selection import train_test_split as tts x_train, x_test, y_train, y_test = tts( features, labels, test_size=0.3) # 80% data is used to train and 20 % for testing # In[10]: # training the model with naive_bayes classifier from sklearn.metrics import confusion_matrix from sklearn.naive_bayes import MultinomialNB clf1 = MultinomialNB() clf1.fit(x_train, y_train) # Predicting the accuracy of the naive_bayes classifier from sklearn.metrics import accuracy_score predict1 = clf1.predict(x_test) print(accuracy_score(y_test, predict1)) print(" Confusion matrix ", confusion_matrix(y_test, predict1))
def main(): if len(sys.argv)!=4: print 'USAGE:' print 'python -m scoop devel.py [cloneID] [clusterDir] [outputDir]' print 'see devel_config.py' return cloneID = sys.argv[1] clusterDir = sys.argv[2]; assert clusterDir[-1]=='/',"should be ended with '/'" baseOutDir = sys.argv[3]; assert baseOutDir[-1]!='/',"should NOT be ended with '/'" clfParam = None method = cfg['method'] if method=='esvm': from esvm_config import config as clfParam elif method=='psvm': from psvm_config import config as clfParam else: print 'FATAL: unknown method' return outDir = os.path.join(baseOutDir,'devel-'+os.path.basename(baseOutDir)) if not(os.path.isdir(baseOutDir)): os.makedirs(baseOutDir) if not(os.path.isdir(outDir)): os.makedirs(outDir) ## Load data ################################################################################### dataLog = {}; dataLogFpath = os.path.join(outDir,'data_log_'+os.path.basename(baseOutDir)+'.json') dataset = clusterDir.split('/')[-2].split('-')[-1]; dataLog['dataset'] = dataset datasetParams = dataset.split('#') assert datasetParams[0]=='yamanishi' xyDevFpath = os.path.join(baseOutDir,'_'.join(['xdev','ydev','xrel','yrel']+datasetParams)+'.h5') if os.path.exists(xyDevFpath): print 'loading data from PREVIOUS...' with h5py.File(xyDevFpath,'r') as f: xdev = f['xdev'][:] ydev = f['ydev'][:] xrel = f['xrel'][:] yrel = f['yrel'][:] xrelraw = f['xrelraw'][:] with open(dataLogFpath,'r') as f: dataLog = yaml.load(f) else: print 'loading data FRESHLY...' print 'loading cluster result...' nUnlabels = [] statFnames = [i for i in os.listdir(clusterDir) if 'labels_stat.json' in i] for i in statFnames: with open(os.path.join(clusterDir,i),'r') as f: stat = yaml.load(f) nUnlabels.append(stat['0']) # use the cluster with minimum numbers of unlabeled samples metric = '_'.join(statFnames[ nUnlabels.index(min(nUnlabels)) ].split('_')[0:2]) dataLog['metric'] = metric connFpath = os.path.join(clusterDir,metric+'_labels.pkl') with open(connFpath,'r') as f: data = pickle.load(f) ## print 'getting devel and release data...' xraw = []; yraw = [] for k,v in data.iteritems(): for vv in v: xraw.append(vv) yraw.append(k) devIdx = [i for i in range(len(xraw)) if yraw[i]!=0] xdev = [xraw[i] for i in devIdx] ydev = [yraw[i] for i in devIdx] relIdx = [i for i in range(len(xraw)) if yraw[i]==0] xrel = [xraw[i] for i in relIdx] yrel = [yraw[i] for i in relIdx] dataLog['nDevel'] = len(devIdx); dataLog['nData'] = len(yraw) dataLog['rDevel:Data'] = dataLog['nDevel']/float(dataLog['nData']) dataLog['nDevel(+)'] = len( [i for i in ydev if i==1] ); assert dataLog['nDevel(+)']!=0 dataLog['nDevel(-)'] = len( [i for i in ydev if i==-1] ); assert dataLog['nDevel(-)']!=0 dataLog['rDevel(+):Devel'] = float(dataLog['nDevel(+)'])/dataLog['nDevel'] dataLog['rDevel(-):Devel'] = float(dataLog['nDevel(-)'])/dataLog['nDevel'] dataLog['rDevel(+):(-)'] = float(dataLog['nDevel(+)'])/float(dataLog['nDevel(-)']) dataLog['nRelease'] = len(relIdx); dataLog['rRelease:Data'] = dataLog['nRelease']/float(dataLog['nData']) ## print 'loading com, pro feature...' 
krFpath = os.path.join(cfg['datasetDir'],datasetParams[0],'feature', 'klekotaroth','klekotaroth-'+datasetParams[1]+'.h5') aacFpath = os.path.join(cfg['datasetDir'],datasetParams[0],'feature', 'amino-acid-composition','amino-acid-composition-'+datasetParams[1]+'.h5') krDict = {}; aacDict = {} with h5py.File(krFpath, 'r') as f: for com in [str(i) for i in f.keys()]: krDict[com] = f[com][:] with h5py.File(aacFpath, 'r') as f: for pro in [str(i) for i in f.keys()]: aacDict[pro] = f[pro][:] # aacDict[pro] = list( fu.map(lambda x: float('%.2f'%(x)),f[pro][:]) ) # rounding comFeaLenOri = len(krDict.values()[0]) proFeaLenOri = len(aacDict.values()[0]) ## msg = 'extract (com,pro) feature... dims: '+str(comFeaLenOri)+','+str(proFeaLenOri) msg += ' of '+str(len(ydev))+' and '+str(len(yrel)) print msg sh.setConst(krDict=krDict) sh.setConst(aacDict=aacDict) xdevf = list( fu.map(cutil.extractComProFea,xdev) ) xrelf = list( fu.map(cutil.extractComProFea,xrel) ) ## xyDevList = cutil.divideSamples(xdevf,ydev,cfg['smoteBatchSize']) if cfg['maxNumberOfSmoteBatch'] != 0: xyDevList = xyDevList[0:cfg['maxNumberOfSmoteBatch']] smoteSeed = util.seed(); dataLog['smoteSeed'] = smoteSeed sh.setConst(smoteSeed=smoteSeed) print 'resampling via Smote FRESHLY... '+str(len(xyDevList))+' smote(s)'+' on '+str(len(ydev)) smoteTic = time.time() xdevfr = []; ydevr = [] xydevfrList = list( fu.map(ensembleSmote,xyDevList) ) for xdevfri,ydevri in xydevfrList: for x in xdevfri: xdevfr.append(x.tolist()) for y in ydevri: ydevr.append(y) assert len(xdevfr)==len(ydevr),'len(xdevfr)!=len(ydevr)' dataLog['nSmote'] = len(xyDevList) dataLog['nDevelResampled'] = len(ydevr) dataLog['rDevelResampled:Data'] = dataLog['nDevelResampled']/float(dataLog['nData']) dataLog['nDevelResampled(+)'] = len( [i for i in ydevr if i==1] ) dataLog['nDevelResampled(-)'] = len( [i for i in ydevr if i==-1] ) dataLog['rDevelResampled(+):DevelResampled'] = dataLog['nDevelResampled(+)']/float(dataLog['nDevelResampled']) dataLog['rDevelResampled(-):DevelResampled'] = dataLog['nDevelResampled(-)']/float(dataLog['nDevelResampled']) dataLog['rDevelResampled(+):(-)'] = dataLog['nDevelResampled(+)']/float(dataLog['nDevelResampled(-)']) dataLog['timeSMOTE'] = str(time.time()-smoteTic) ## print 'update xdev,ydev,xrel... '+str(np.asarray(xdevfr).shape) xrelraw = xrel[:] # raw: feature is NOT extracted xrel = xrelf[:] xdev = xdevfr[:] ydev = ydevr[:] print 'writing updated xdev,ydev and xrel,yrel...' with h5py.File(xyDevFpath,'w') as f: f.create_dataset('xdev',data=xdev,dtype=np.float32) f.create_dataset('ydev',data=ydev,dtype=np.int8) f.create_dataset('xrel',data=xrel,dtype=np.float32) f.create_dataset('yrel',data=yrel,dtype=np.int8) f.create_dataset('xrelraw',data=xrelraw) print 'writing dataLog...' 
dataLog['nCom'] = len(krDict) dataLog['nPro'] = len(aacDict) with open(dataLogFpath,'w') as f: json.dump(dataLog,f,indent=2,sort_keys=True) ## TUNE+TRAIN+TEST ############################################################################# devLog = {} devSeed = util.seed(); dataLog['devSeed'] = devSeed tag = '_'.join([method+'#'+cloneID,dataset,util.tag()]) ## split devel dataset msg = ' '.join( ['devel',dataset,cloneID]) xtr,xte,ytr,yte = tts(xdev,ydev,test_size=cfg['testSize'], random_state=devSeed,stratify=ydev) if cfg['maxTestingSamples']>0: chosenIdx = np.random.randint(len(xte),size=cfg['maxTestingSamples']) xte = [xte[i] for i in chosenIdx]; yte = [yte[i] for i in chosenIdx] devLog['nTraining'] = len(xtr) devLog['nTraining(+)'] = len([i for i in ytr if i==1]) devLog['nTraining(-)'] = len([i for i in ytr if i==-1]) devLog['rTraining(+):(-)'] = devLog['nTraining(+)']/float(devLog['nTraining(-)']) devLog['rTraining:Devel'] = devLog['nTraining']/float(dataLog['nDevelResampled']) devLog['nTesting'] = len(xte) devLog['nTesting(+)'] = len([i for i in yte if i==1]) devLog['nTesting(-)'] = len([i for i in yte if i==-1]) devLog['rTesting(+):(-)'] = devLog['nTesting(+)']/float(devLog['nTesting(-)']) devLog['rTesting:Devel'] = devLog['nTesting']/float(dataLog['nDevelResampled']) ## tuning clf = None if method=='esvm': clf = eSVM(simMat=None) elif method=='psvm': clf = svm.SVC(kernel=clfParam['kernel'],probability=True) ## training print msg+': fitting nTr= '+str(len(ytr)) trTic = time.time() if method=='esvm': clf.fit(xtr,ytr) devLog['labels'] = clf.labels() devLog['nSVM'] = clf.nSVM() devLog['xtrDimAllBatches'] = clf.xtrDimAllBatches() elif method=='psvm': if cfg['method']['kernel']=='precomputed': assert False # simMatTr = cutil.makeComProKernelMatFromSimMat(xtr,xtr,simMat) # clf.fit(simMatTr,ytr) else: clf.fit(xtr,ytr) devLog['labels'] = clf.classes_.tolist() devLog['timeTraining'] = str(time.time()-trTic) ## testing print msg+': predicting nTe= '+str(len(yte)) teTic = time.time() if method=='esvm': ypred,yscore = clf.predict(xte) elif method=='psvm': if cfg['method']['kernel']=='precomputed': assert False # simMatTe = cutil.makeComProKernelMatFromSimMat(xte,xtr,simMat) # ypred = clf.predict(simMatTe) # yscore = clf.predict_proba(simMatTe) else: ypred = clf.predict(xte) yscore = clf.predict_proba(xte) yscore = [max(i.tolist()) for i in yscore] devLog['timeTesting'] = str(time.time()-teTic) ## TEST RELEASE ################################################################################ print msg+': predicting RELEASE n= '+str(len(yrel)) relTic = time.time() if method=='esvm': yrel,yrelscore = clf.predict(xrel) elif method=='psvm': if cfg['method']['kernel']=='precomputed': assert False # simMatTe = cutil.makeComProKernelMatFromSimMat(xrel,xtr,simMat) # yrel = clf.predict(simMatTe) # yrelscore = clf.predict_proba(simMatTe) else: yrel = clf.predict(xrel) yrelscore = clf.predict_proba(xrel) yrelscore = [max(i.tolist()) for i in yrelscore] devLog['timeRelease'] = str(time.time()-relTic) ## WRITE RESULT ################################################################################ result = {'yte':yte,'ypred':ypred,'yscore':yscore, 'xrelraw':xrelraw,'yrel':yrel,'yrelscore':yrelscore} print 'writing prediction...' with h5py.File(os.path.join(outDir,'result_'+tag+'.h5'),'w') as f: for k,v in result.iteritems(): if 'raw' in k: f.create_dataset(k,data=v) else: dt = np.int8 if 'score' in k: dt = np.float32 f.create_dataset(k,data=v,dtype=dt) ## print 'writing devLog...' 
devLog['clfParam'] = clfParam devLog['devParam'] = cfg with open(os.path.join(outDir,'devLog_'+tag+'.json'),'w') as f: json.dump(devLog,f,indent=2,sort_keys=True)
import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split as tts from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegression from sklearn.metrics import log_loss california_housing_dataframe = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv", sep=",") X = california_housing_dataframe[ ["latitude", "longitude", "housing_median_age", "total_rooms", "total_bedrooms", "population", "households", "median_income"]] y=california_housing_dataframe["median_house_value"] X.describe() X_train,X_test,y_train,y_test= tts(X,y,random_state=0,test_size=0.2) sc= StandardScaler() X_train= sc.fit_transform(X_train) X_test= sc.transform(X_test) classifier= LogisticRegression(random_state=0) classifier.fit(X_train,y_train) y_pred= classifier.predict(X_test) cm= log_loss(y_test,y_pred) print(cm)