import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from skimage.color import rgb2lab
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

# plot_predictions is assumed to be provided elsewhere in this project.


def main(infile):
    data = pd.read_csv(infile)

    # array with shape (n, 3). Divide by 255 so components are all 0-1.
    X = np.array([data['R'], data['G'], data['B']]).transpose().reshape((-1, 3))
    X = X / 255
    # array with shape (n,) of colour words.
    y = np.array(data['Label'])

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Build model_rgb to predict y from X and print its accuracy score.
    model_rgb = GaussianNB()
    model_rgb.fit(X_train, y_train)
    print(model_rgb.score(X_test, y_test))
    plot_predictions(model_rgb)
    plt.savefig('predictions_rgb.png')

    # Build model_lab to predict y from X by converting to LAB colour first,
    # then print its accuracy score.
    def convert_rgb(X):
        X = X.reshape(1, -1, 3)
        X = rgb2lab(X)  # rgb2lab returns the converted array; it must be reassigned
        X = X.reshape(-1, 3)
        return X

    model_lab = make_pipeline(
        FunctionTransformer(convert_rgb),
        GaussianNB()
    )
    model_lab.fit(X_train, y_train)
    print(model_lab.score(X_test, y_test))
    plot_predictions(model_lab)
    plt.savefig('predictions_lab.png')
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split


def main():
    iris = load_iris()  # load_iris must be called, not just referenced
    X = iris.data[:100, [0, 2]]
    y = iris.target[:100]
    y = np.where(y == 1, 1, -1)  # relabel the two classes as +1 / -1
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.3)
    # PerceptronBase is assumed to be defined elsewhere in the project.
    ppn = PerceptronBase(eta=0.1, n_iter=10)
    ppn.fit(X_train, y_train)
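The snippet above relies on a PerceptronBase class that is not shown. As a minimal sketch (an assumption following the classic Rosenblatt perceptron update rule, not the project's actual implementation), such a class could look like this:

# Illustrative sketch of a PerceptronBase class; not the original project's code.
import numpy as np

class PerceptronBase(object):
    def __init__(self, eta=0.01, n_iter=10):
        self.eta = eta          # learning rate
        self.n_iter = n_iter    # number of passes over the training set

    def fit(self, X, y):
        # one weight per feature plus a bias term at index 0
        self.w_ = np.zeros(1 + X.shape[1])
        self.errors_ = []
        for _ in range(self.n_iter):
            errors = 0
            for xi, target in zip(X, y):
                update = self.eta * (target - self.predict(xi))
                self.w_[1:] += update * xi
                self.w_[0] += update
                errors += int(update != 0.0)
            self.errors_.append(errors)
        return self

    def net_input(self, X):
        return np.dot(X, self.w_[1:]) + self.w_[0]

    def predict(self, X):
        return np.where(self.net_input(X) >= 0.0, 1, -1)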
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.signal import downsample
from sklearn.datasets import fetch_mldata
from sklearn.cross_validation import train_test_split


def max_pool_2d(x, poolsize=(2, 2)):
    pooled_out = downsample.max_pool_2d(
        input=x,
        ds=poolsize,
        ignore_border=True
    )
    return pooled_out


def embed_id(sentences=None, n_vocab=None, k_wrd=None):
    if sentences is None or n_vocab is None or k_wrd is None:
        raise NotImplementedError()
    tmp = sentences.get_value(borrow=True)
    max_sent_len = len(tmp[0])
    x_wrd = []
    for sentence in tmp:
        # one-hot matrix padded so that a window of k_wrd words fits at every position
        word_mat = np.array([[0] * n_vocab] * (max_sent_len + k_wrd - 1), dtype='int8')
        i = 0
        for word in sentence:
            word_mat[(k_wrd / 2) + i][word] = 1
            i += 1
        x_wrd.append(word_mat)
    return theano.shared(np.asarray(x_wrd, dtype='int8'), borrow=False)


class Result(object):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def negative_log_likelihood(self):
        self.prob_of_y_given_x = T.nnet.softmax(self.x)
        return -T.mean(T.log(self.prob_of_y_given_x)[T.arange(self.y.shape[0]), self.y])

    def cross_entropy(self):
        self.prob_of_y_given_x = T.nnet.softmax(self.x)
        return T.mean(T.nnet.categorical_crossentropy(self.prob_of_y_given_x, self.y))

    def mean_squared_error(self):
        return T.mean((self.x - self.y) ** 2)

    def errors(self):
        # compute y_pred before comparing its shape with y
        self.prob_of_y_given_x = T.nnet.softmax(self.x)
        self.y_pred = T.argmax(self.prob_of_y_given_x, axis=1)
        if self.y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                            ('y', self.y.type, 'y_pred', self.y_pred.type))
        if self.y.dtype.startswith('int'):
            return T.mean(T.neq(self.y_pred, self.y))
        else:
            raise NotImplementedError()

    def accuracy(self):
        if self.y.dtype.startswith('int'):
            self.prob_of_y_given_x = T.nnet.softmax(self.x)
            self.y_pred = T.argmax(self.prob_of_y_given_x, axis=1)
            return T.mean(T.eq(self.y_pred, self.y))
        else:
            raise NotImplementedError()


def load_data(random_state=0):
    print 'fetch MNIST dataset'
    mnist = fetch_mldata('MNIST original')
    mnist.data = mnist.data.astype(np.float32)
    mnist.data /= 255
    mnist.target = mnist.target.astype(np.int32)

    data_train, data_test, target_train, target_test = \
        train_test_split(mnist.data, mnist.target, random_state=random_state)

    def shared_data(x, y):
        shared_x = theano.shared(np.asarray(x, dtype=theano.config.floatX), borrow=True)
        shared_y = theano.shared(np.asarray(y, dtype=theano.config.floatX), borrow=True)
        return shared_x, T.cast(shared_y, 'int32')

    data_train, target_train = shared_data(data_train, target_train)
    data_test, target_test = shared_data(data_test, target_test)

    return ([data_train, data_test], [target_train, target_test])


def load_livedoor_news_corpus(random_state=0, test_size=0.1):
    import six.moves.cPickle as pickle
    data_, target_ = pickle.load(open('dataset', 'rb'))

    data_train, data_test, target_train, target_test = \
        train_test_split(data_, target_, random_state=random_state, test_size=test_size)

    def shared_data(x, y):
        shared_x = theano.shared(np.asarray(x, dtype=theano.config.floatX), borrow=True)
        shared_y = theano.shared(np.asarray(y, dtype=theano.config.floatX), borrow=True)
        return shared_x, T.cast(shared_y, 'int32')

    data_train, target_train = shared_data(data_train, target_train)
    data_test, target_test = shared_data(data_test, target_test)

    return ([data_train, data_test], [target_train, target_test])


def build_shared_zeros(shape, name):
    """ Builds a theano shared variable filled with a zeros numpy array """
    return theano.shared(
        value=np.zeros(shape, dtype=theano.config.floatX),
        name=name,
        borrow=True
    )
import numpy as np
import pandas as pd
import cv2
import glob
from sklearn.cross_validation import train_test_split

## Load the labels
labels = pd.read_table("/data/dr/trainLabels.csv", sep=",", index_col=["image"])
labels = labels.drop("420_right")  # drop() returns a new frame, so the result must be kept

filelist = glob.glob("/data/dr/data/sample_270_270/*.jpeg")

# Stratified sampling for dividing data into X_train and X_valid
x_train, x_valid, y_train, y_valid = train_test_split(filelist, labels,
                                                      test_size=0.10, random_state=20)

## train/test split
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train,
                                                    test_size=0.10, random_state=20)

# Read the x_valid and y_valid images
x_valid = np.array([cv2.imread("/data/dr/data/sample_270_270/" + i + ".jpeg")
                    for i in y_valid.index])
y_valid = np.array(pd.get_dummies(y_valid["level"]))

# model is assumed to be defined earlier in the script
model_train = model((270, 270, 3))
"""
The key point is that a class bundles functions (the processing part) and variables (the data part) into a single unit, from which objects (instances) are created and used.

Object: any thing or concept that actually exists
Class: the definition of an object
Instance: similar to an object; the process of creating an object from a class is called "instantiating the class"

Object - a mobile phone
Class - the blueprint for a mobile phone
Instantiation - the process of building a phone from the blueprint

from sklearn import datasets
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=321)

Parameters
array: the data to be split
test_size: proportion (or count) of the test dataset
train_size: proportion (or count) of the training dataset
random_state: seed value for the shuffle that happens when the data is split
shuffle: whether to shuffle

from sklearn.feature_selection import SelectKBest, f_classif

SelectKBest: selects features according to the k highest scores.

Parameters
score_func: callable
k: int or "all", optional, default=10

Machine learning library
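To make these notes concrete, here is a minimal sketch (not from the original notes) that combines train_test_split and SelectKBest(f_classif) on scikit-learn's built-in iris data; the variable names and k=2 are illustrative assumptions.

# Minimal sketch combining train_test_split and SelectKBest on the iris dataset.
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif

iris = datasets.load_iris()
X, Y = iris.data, iris.target

# Split the data, holding out 25% for testing with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=321)

# Keep the 2 features with the highest ANOVA F-scores, fitted on the training split only.
selector = SelectKBest(score_func=f_classif, k=2)
X_train_selected = selector.fit_transform(X_train, Y_train)
X_test_selected = selector.transform(X_test)

print(X_train_selected.shape, X_test_selected.shape)  # (112, 2) (38, 2)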
# coding: utf-8
import pandas

data = pandas.read_csv('housing_oslo.csv')
data.head()

X = data.drop('Price', axis=1)
X.head(2)
Y = data['Price']
Y.head(2)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=0)
X_train.head(2)

from sklearn.linear_model import LinearRegression
regression = LinearRegression()
regression.fit(X_train, y_train)

import numpy
new_data_item = numpy.array([[168, 202, 4, 3, 4, 2014, 0, 171, 1, 1]])
regression.predict(new_data_item)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import metrics
import joblib

## Load dataset.csv
dataframe = pd.read_csv("csv/dataset.csv")
print(dataframe.head())

# Split into training and test data
data_x = dataframe.drop(["Label"], axis=1)
data_y = dataframe["Label"]
trained_x, test_x, trained_y, test_y = train_test_split(data_x, data_y,
                                                        test_size=0.2, random_state=4)

## Build the model
model = RandomForestClassifier(n_estimators=100, max_depth=5)
model.fit(trained_x, trained_y)
joblib.dump(model, "rf_malaria_100_5")

# Make predictions and get a classification report on the test data
predictions = model.predict(test_x)
print(metrics.classification_report(predictions, test_y))
In [3]: # remove duplicated columns
   ...: remove = []
   ...: cols = train.columns
   ...: for i in range(len(cols)-1):
   ...:     v = train[cols[i]].values
   ...:     for j in range(i+1, len(cols)):
   ...:         if np.array_equal(v, train[cols[j]].values):
   ...:             remove.append(cols[j])
   ...: train.drop(remove, axis=1, inplace=True)
   ...: test.drop(remove, axis=1, inplace=True)
   ...: # split data into train and test
   ...: test_id = test.ID
   ...: test = test.drop(["ID"], axis=1)
   ...: X = train.drop(["TARGET", "ID"], axis=1)
   ...: y = train.TARGET.values
   ...: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1729)
   ...: print(X_train.shape, X_test.shape, test.shape)
   ...: # Feature selection
   ...: clf = ExtraTreesClassifier(random_state=1729)
   ...: selector = clf.fit(X_train, y_train)

In [6]: feat_imp = pd.Series(clf.feature_importances_, index=X_train.columns.values).sort_values(ascending=False)
   ...: feat_imp[:40].plot(kind='bar', title='Feature Importances according to ExtraTreesClassifier', figsize=(12, 8))
   ...: plt.ylabel('Feature Importance Score')
   ...: plt.subplots_adjust(bottom=0.3)
   ...: plt.savefig('1.png')
   ...: plt.show()
   ...: # clf.feature_importances_
   ...: fs = SelectFromModel(selector, prefit=True)
   ...: X_train = fs.transform(X_train)
   ...: X_test = fs.transform(X_test)
   ...: test = fs.transform(test)
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting classifier to the Training set
# Create your classifier here

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
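The template above deliberately leaves the classifier unspecified ("Create your classifier here"). As one hedged illustration, not part of the original template, a logistic regression could be dropped into that slot:

# Illustrative only: any scikit-learn classifier could be used in place of LogisticRegression.
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)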
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# data_jour and data_meteo are assumed to be loaded earlier in the notebook.
join_data = pd.merge(data_jour, data_meteo, how='right',
                     on=['jour', 'year', 'month', 'date'])
join_data = join_data[~join_data['frequentation'].isnull()]
join_data.to_csv('join_data.csv')

y_meteo = join_data.frequentation
X_meteo = join_data[[
    'sur_place', 'livraison', 'semaine', 'jour', 'year', 'month', 'date',
    'time', 'icon', 'precipintensity', 'temperature', 'humidity', 'pressure'
]]

X_train_meteo, X_test_meteo, y_train_meteo, y_test_meteo = train_test_split(
    X_meteo, y_meteo, train_size=0.9, random_state=2)

# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [
    cname for cname in X_train_meteo.columns
    if X_train_meteo[cname].nunique() < 10 and X_train_meteo[cname].dtype == "object"
]

# Select numerical columns
numerical_cols = [
    cname for cname in X_train_meteo.columns
    if X_train_meteo[cname].dtype in ['int64', 'float64']
]

numerical_transformer = SimpleImputer(strategy='median')
Choose any value of $k$ between 1 and 10. For different partitions of the full data set into training and testing sets (e.g., 10%/90%, 20%/80%, 30%/70%, etc.), obtain training and test error rates. Plot training and test error (on the same axes) vs. the proportion of training examples. Briefly comment on the insights that this plot yields.

#### <font color="green">Solution 2</font>

# imports
import numpy as np
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# calculation loop
error_dict = {'train_proportion': [], 'train_error': [], 'test_error': []}  # store results in a dictionary

for split in np.arange(0.1, 1, 0.1):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split, random_state=123)  # split data
    model = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)  # create model
    error_dict['train_proportion'].append(1 - split)  # store the train proportion ("split") for plotting
    error_dict['train_error'].append(1 - model.score(X_train, y_train))  # store train error
    error_dict['test_error'].append(1 - model.score(X_test, y_test))  # store test error

# plot with matplotlib
plt.rcParams.update({'font.size': 16})  # update font size of plot
plt.subplots(1, 1, figsize=(7, 5))  # create plot canvas
plt.plot(error_dict['train_proportion'], error_dict['train_error'], '-b', label='train error')  # plot the two lines
plt.plot(error_dict['train_proportion'], error_dict['test_error'], '-r', label='test error')
plt.grid()  # the next few lines are formatting
plt.title('Train proportion vs error')
plt.xlabel('Train proportion')
plt.ylabel('Error')
plt.legend();
# Rescale the image data into the 0-1 range
y = []
x = []
for d in data:
    (num, img) = d  # each element is a (label, image) pair
    img = img.reshape(-1).astype("float") / 255
    y.append(keras.utils.np_utils.to_categorical(num, out_size))
    x.append(img)
x = np.array(x)
y = np.array(y)

# Split into training and test sets
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, train_size=0.8
)

# Define the model
Dense = keras.layers.Dense
model = keras.models.Sequential()
model.add(Dense(512, activation="relu", input_shape=(in_size,)))
model.add(Dense(out_size, activation="softmax"))

# Compile the model and run training
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=['accuracy']
)
model.fit(
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from matplotlib.colors import ListedColormap

# data columns: (userid, gender, age, estimatedsalary, purchased)
data = pd.read_csv()
data.head(10)

real_x = data.iloc[:, [2, 3]].values
real_x
real_y = data.iloc[:, 4].values
real_y

# Split the dataset
training_x, test_x, training_y, test_y = train_test_split(real_x, real_y,
                                                           test_size=0.25, random_state=0)

# Check the data sets
training_x
test_x

# Feature scaling: when the independent variables have very different ranges,
# StandardScaler rescales them to a comparable range (most values end up roughly between -2 and 2).
scaler = StandardScaler()
training_x = scaler.fit_transform(training_x)
test_x = scaler.transform(test_x)  # reuse the scaler fitted on the training data

# Build the classifier
classifier_LR = LogisticRegression(random_state=0)

# Train the model
classifier_LR.fit(training_x, training_y)
import pickle
from sklearn.model_selection import train_test_split


def load_and_split_data(data_handling_params):
    # Load pickled feature data and labels, then split them into train/test sets.
    with open(data_handling_params['data_path'], 'rb') as f:
        data = pickle.load(f)
    with open(data_handling_params['labels_path'], 'rb') as f:
        labels = pickle.load(f)
    return train_test_split(data, labels,
                            test_size=data_handling_params['test_percent'])
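A hypothetical usage of load_and_split_data; the paths and split fraction below are illustrative assumptions, not values from the original project.

# All values below are placeholders for illustration.
params = {
    'data_path': 'data/features.pkl',
    'labels_path': 'data/labels.pkl',
    'test_percent': 0.2,
}
X_train, X_test, y_train, y_test = load_and_split_data(params)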
899       3  1  female  27.0  0  2  347742   11.1333   NaN  S   15    NaN  NaN
161       1  1  female  51.0  1  0   13502   77.9583   D11  S   10    NaN  Hudson, NY
254       1  1  male     NaN  0  0   19988   30.5000  C106  S    3    NaN  Manchester, England
409       2  0  male    36.0  0  0  229236   13.0000   NaN  S  NaN  236.0  Rochester, NY

In [12]: from sklean.model_selection import train_test_split
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-12-e254ffd72cf7> in <module>
----> 1 from sklean.model_selection import train_test_split

ModuleNotFoundError: No module named 'sklean'

In [13]: from sklearn.model_selection import train_test_split

In [14]: train, test = train_test_split(titanic, test_size=0.2)

In [15]: import statsmodels.formula.api as smf

In [16]: m = smf.logit(formula = 'survived ~ 1', data=train)

In [17]: m = smf.logit(formula = 'survived ~ 1', data=train).fit()
Optimization terminated successfully.
         Current function value: 0.665513
         Iterations 4

In [18]: m.summary()
Out[18]:
<class 'statsmodels.iolib.summary.Summary'>
"""
                           Logit Regression Results