import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from skimage.color import rgb2lab


def main(infile):
    data = pd.read_csv(infile)
    X = np.stack([data['R'], data['G'], data['B']], axis=1)  # array with shape (n, 3)
    X = X / 255  # divide by 255 so components are all 0-1
    y = data['Label'].values  # array with shape (n,) of colour words

    # Build model_rgb to predict y from X and print its accuracy score.
    # Then build model_lab, which converts to LAB colour space first, and print its score.
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model_rgb = GaussianNB()
    model_rgb.fit(X_train, y_train)
    print(model_rgb.score(X_test, y_test))
    plot_predictions(model_rgb)  # plot_predictions: assumed to be provided by the assignment scaffolding
    plt.savefig('predictions_rgb.png')
    def convert_rgb(X):
        # rgb2lab expects an image-shaped array, so reshape to (1, n, 3), convert, and flatten back
        X = X.reshape(1, -1, 3)
        X = rgb2lab(X)  # the result must be assigned; rgb2lab does not modify in place
        return X.reshape(-1, 3)

    model_lab = make_pipeline(
        FunctionTransformer(convert_rgb),
        GaussianNB()
    )
    model_lab.fit(X_train, y_train)
    print(model_lab.score(X_test, y_test))
    plot_predictions(model_lab)
    plt.savefig('predictions_lab.png')
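A minimal entry point, assuming the script takes the CSV path as its first command-line argument (the original snippet does not show how main is invoked):

if __name__ == '__main__':
    import sys
    main(sys.argv[1])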
Example #2
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split


def main():
    iris = load_iris()  # call the loader; a bare load_iris would bind the function itself
    X = iris.data[:100, [0, 2]]  # sepal length and petal length, first two classes only
    y = iris.target[:100]
    y = np.where(y == 1, 1, -1)  # relabel as +1/-1 for the perceptron
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.3)

    ppn = PerceptronBase(eta=0.1, n_iter=10)  # PerceptronBase: custom class, sketched below
    ppn.fit(X_train, y_train)
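PerceptronBase is not defined in this snippet; a minimal sketch of what such a class might look like, assuming the classic Rosenblatt update rule with learning rate eta over n_iter epochs:

class PerceptronBase:
    """Minimal perceptron sketch (assumed interface: eta, n_iter, fit, predict)."""
    def __init__(self, eta=0.01, n_iter=10):
        self.eta = eta
        self.n_iter = n_iter

    def fit(self, X, y):
        self.w_ = np.zeros(1 + X.shape[1])  # weights, bias stored at index 0
        for _ in range(self.n_iter):
            for xi, target in zip(X, y):
                # update weights only when the prediction is wrong
                update = self.eta * (target - self.predict(xi))
                self.w_[1:] += update * xi
                self.w_[0] += update
        return self

    def predict(self, X):
        return np.where(np.dot(X, self.w_[1:]) + self.w_[0] >= 0.0, 1, -1)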
Example #3
def max_pool_2d(x, poolsize=(2, 2)):
    pooled_out = downsample.max_pool_2d(input=x, ds=poolsize, ignore_border=True)
    return pooled_out
def embed_id(sentences=None, n_vocab=None, k_wrd=None):
    if sentences is None or n_vocab is None or k_wrd is None:
        raise NotImplementedError()
    tmp = sentences.get_value(borrow=True)
    max_sent_len = len(tmp[0])
    x_wrd = []
    for sentence in tmp:
        # one-hot matrix, padded so a k_wrd-wide window fits at every position
        word_mat = np.array([[0] * n_vocab] * (max_sent_len + k_wrd - 1), dtype='int8')
        for i, word in enumerate(sentence):
            word_mat[(k_wrd // 2) + i][word] = 1
        x_wrd.append(word_mat)
    return theano.shared(x_wrd, borrow=False)


class Result(object):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def negative_log_likelihood(self):
        self.prob_of_y_given_x = T.nnet.softmax(self.x)
        return -T.mean(T.log(self.prob_of_y_given_x)[T.arange(self.y.shape[0]), self.y])

    def cross_entropy(self):
        self.prob_of_y_given_x = T.nnet.softmax(self.x)
        return T.mean(T.nnet.categorical_crossentropy(self.prob_of_y_given_x, self.y))

    def mean_squared_error(self):
        return T.mean((self.x - self.y) ** 2)

    def errors(self):
        if self.y.dtype.startswith('int'):
            self.prob_of_y_given_x = T.nnet.softmax(self.x)
            self.y_pred = T.argmax(self.prob_of_y_given_x, axis=1)
            # y_pred must be computed before its shape can be checked
            if self.y.ndim != self.y_pred.ndim:
                raise TypeError('y should have the same shape as self.y_pred',
                                ('y', self.y.type, 'y_pred', self.y_pred.type))
            return T.mean(T.neq(self.y_pred, self.y))
        else:
            raise NotImplementedError()

    def accuracy(self):
        if self.y.dtype.startswith('int'):
            self.prob_of_y_given_x = T.nnet.softmax(self.x)
            self.y_pred = T.argmax(self.prob_of_y_given_x, axis=1)
            return T.mean(T.eq(self.y_pred, self.y))
        else:
            raise NotImplementedError()
def load_data(random_state=0):
    print('fetch MNIST dataset')
    mnist = fetch_mldata('MNIST original')
    mnist.data = mnist.data.astype(np.float32)
    mnist.data /= 255
    mnist.target = mnist.target.astype(np.int32)
    data_train, data_test, target_train, target_test = \
        train_test_split(mnist.data, mnist.target, random_state=random_state)

    def shared_data(x, y):
        shared_x = theano.shared(np.asarray(x, dtype=theano.config.floatX), borrow=True)
        shared_y = theano.shared(np.asarray(y, dtype=theano.config.floatX), borrow=True)
        return shared_x, T.cast(shared_y, 'int32')

    data_train, target_train = shared_data(data_train, target_train)
    data_test, target_test = shared_data(data_test, target_test)
    return ([data_train, data_test], [target_train, target_test])
def load_livedoor_news_corpus(random_state=0, test_size=0.1):
    import six.moves.cPickle as pickle
    data_, target_ = pickle.load(open('dataset', 'rb'))
    data_train, data_test, target_train, target_test = \
        train_test_split(data_, target_, random_state=random_state, test_size=test_size)

    def shared_data(x, y):
        shared_x = theano.shared(np.asarray(x, dtype=theano.config.floatX), borrow=True)
        shared_y = theano.shared(np.asarray(y, dtype=theano.config.floatX), borrow=True)
        return shared_x, T.cast(shared_y, 'int32')

    data_train, target_train = shared_data(data_train, target_train)
    data_test, target_test = shared_data(data_test, target_test)
    return ([data_train, data_test], [target_train, target_test])

def build_shared_zeros(shape, name):
    """Builds a theano shared variable filled with a zeros numpy array."""
    return theano.shared(
        value=np.zeros(shape, dtype=theano.config.floatX),
        name=name,
        borrow=True,
    )
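For instance, a zero-initialised bias vector for a layer with n_out units could be allocated as follows (a usage sketch; n_out is assumed to be defined):

b = build_shared_zeros((n_out,), 'b')  # shared variable, updated in place during training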


Example #4
import numpy as np
import pandas as pd
import cv2
import glob
from sklearn.model_selection import train_test_split


## Load the labels
labels = pd.read_table("/data/dr/trainLabels.csv", sep=",", index_col=["image"])
labels = labels.drop("420_right")  # drop() returns a copy, so the result must be reassigned



filelist = glob.glob("/data/dr/data/sample_270_270/*.jpeg")

# Split data into training and validation sets
# (pass stratify=labels["level"] here for actual stratified sampling)
x_train, x_valid, y_train, y_valid = train_test_split(filelist, labels,
  test_size=0.10, random_state=20)


## train/test split
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train,
  test_size=0.10, random_state=20)


# Read the x_valid images and one-hot encode the y_valid labels
x_valid = np.array([cv2.imread("/data/dr/data/sample_270_270/" + i + ".jpeg") for i in y_valid.index])
y_valid = np.array(pd.get_dummies(y_valid["level"]))

model_train = model((270, 270, 3))  # model(): network builder for 270x270 RGB input, not shown in this snippet
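The model() builder is not included in the snippet; a minimal Keras sketch of what it might look like, assuming five diabetic-retinopathy severity levels (all layer and optimizer choices here are assumptions):

from tensorflow.keras import layers, models

def model(input_shape):
    net = models.Sequential([
        layers.Conv2D(32, 3, activation='relu', input_shape=input_shape),
        layers.MaxPooling2D(),
        layers.Conv2D(64, 3, activation='relu'),
        layers.MaxPooling2D(),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(5, activation='softmax'),  # five severity levels, matching get_dummies above
    ])
    net.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return net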

"""
Example #5
The key point is that a class bundles functions (the processing part) and variables (the data part)
into one unit, from which objects (instances) are created and used.

    Object: any thing or concept that actually exists
    Class: the definition of an object
    Instance: similar to an object; the process of creating an object
            from a class is called "instantiating the class"

            Object - a mobile phone
            Class - the phone's blueprint
            Instantiation - the process of building a phone from the blueprint (see the sketch below)
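As a minimal illustration of these terms (the Phone class here is a made-up example):

class Phone:                      # class: the blueprint
    def __init__(self, owner):    # data part (variables)
        self.owner = owner

    def call(self, number):       # processing part (functions)
        print(f"{self.owner} is calling {number}")

my_phone = Phone("Kim")           # instantiation: creating an object from the class
my_phone.call("010-1234-5678")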

from sklearn import datasets
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.25, random_state=321)
Parameters
arrays: the data to be split
test_size: proportion or absolute number of samples in the test set
train_size: proportion or absolute number of samples in the training set
random_state: seed for the shuffling performed when splitting
shuffle: whether to shuffle before splitting
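For example, with X and Y as above (the sizes and seed shown are arbitrary):

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.2,      # 20% of samples go to the test set
    train_size=0.8,     # remaining 80% to the training set
    random_state=42,    # fixed seed so the split is reproducible
    shuffle=True        # shuffle before splitting (the default)
)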

from sklearn.feature_selection import SelectKBest, f_classif

SelectKBest: selects features according to the k highest scores.
Parameters
score_func: callable
k: int or "all", optional, default=10
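A short usage sketch, assuming a feature matrix X and labels y are already defined:

selector = SelectKBest(score_func=f_classif, k=5)  # keep the 5 best-scoring features
X_new = selector.fit_transform(X, y)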

Machine learning libraries
Example #6
# coding: utf-8
import pandas
data=pandas.read_csv('housing_oslo.csv')
data.head()
X = data.drop('Price', axis=1)
X.head(2)
Y=data['Price']
Y.head(2)
from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(X,Y,random_state=0)
X_train.head(2)
from sklearn.linear_model import LinearRegression
regression=LinearRegression()
regression.fit(X_train,y_train)
import numpy
new_data_item = numpy.array([[168, 202, 4, 3, 4, 2014, 0, 171, 1, 1]])  # one value per training feature column
regression.predict(new_data_item)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import metrics
import joblib

## load dataset.csv
dataframe = pd.read_csv("csv/dataset.csv")
print(dataframe.head())

# Split into training and test data
data_x = dataframe.drop(["Label"], axis=1)
data_y = dataframe["Label"]
trained_x, test_x, trained_y, test_y = train_test_split(data_x, data_y, test_size=0.2, random_state=4)

## Build the model
model = RandomForestClassifier(n_estimators=100, max_depth=5)
model.fit(trained_x, trained_y)
joblib.dump(model, "rf_malaria_100_5")  # persist the fitted model; reload later with joblib.load

# Make predictions and print a classification report on the held-out test data
predictions = model.predict(test_x)
print(metrics.classification_report(test_y, predictions))  # y_true first, then y_pred
 
In [3]: # remove duplicated columns
   ...: remove = []
   ...: cols = train.columns
   ...: for i in range(len(cols)-1):
   ...:     v = train[cols[i]].values
   ...:     for j in range(i+1,len(cols)):
   ...:         if np.array_equal(v,train[cols[j]].values):
   ...:             remove.append(cols[j])
   ...: train.drop(remove, axis=1, inplace=True)
   ...: test.drop(remove, axis=1, inplace=True)
   ...: # split data into train and test
   ...: test_id = test.ID
   ...: test = test.drop(["ID"],axis=1)
   ...: X = train.drop(["TARGET","ID"],axis=1)
   ...: y = train.TARGET.values
   ...: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1729)
   ...: print(X_train.shape, X_test.shape, test.shape)
   ...: # Feature selection
   ...: clf = ExtraTreesClassifier(random_state=1729)
   ...: selector = clf.fit(X_train, y_train)
In [6]: feat_imp = pd.Series(clf.feature_importances_, index = X_train.columns.values).sort_values(ascending=False)
   ...: feat_imp[:40].plot(kind='bar', title='Feature Importances according to ExtraTreesClassifier', figsize=(12, 8))
   ...: plt.ylabel('Feature Importance Score')
   ...: plt.subplots_adjust(bottom=0.3)
   ...: plt.savefig('1.png')
   ...: plt.show()
   ...: # clf.feature_importances_ 
   ...: fs = SelectFromModel(selector, prefit=True)
   ...: X_train = fs.transform(X_train)
   ...: X_test = fs.transform(X_test)
   ...: test = fs.transform(test)
Example #9
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting classifier to the Training set
# Create your classifier here
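# For example, a logistic regression classifier (one possible choice; the
# original template deliberately leaves this step to the reader):
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)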

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
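# A minimal completion of this step (the original snippet ends at the comment above):
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)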
Example #10
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

join_data = pd.merge(data_jour,
                     data_meteo,
                     how='right',
                     on=['jour', 'year', 'month', 'date'])
join_data = join_data[~join_data['frequentation'].isnull()]
join_data.to_csv('join_data.csv')
y_meteo = join_data.frequentation

X_meteo = join_data[[
    'sur_place', 'livraison', 'semaine', 'jour', 'year', 'month', 'date',
    'time', 'icon', 'precipintensity', 'temperature', 'humidity', 'pressure'
]]

X_train_meteo, X_test_meteo, y_train_meteo, y_test_meteo = train_test_split(
    X_meteo, y_meteo, train_size=0.9, random_state=2)

# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [
    cname for cname in X_train_meteo.columns
    if X_train_meteo[cname].nunique() < 10
    and X_train_meteo[cname].dtype == "object"
]

# Select numerical columns
numerical_cols = [
    cname for cname in X_train_meteo.columns
    if X_train_meteo[cname].dtype in ['int64', 'float64']
]

numerical_transformer = SimpleImputer(strategy='median')
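The snippet stops after the numerical imputer; a sketch of how the preprocessing would typically be completed, following the usual ColumnTransformer pattern (the categorical transformer and the regressor below are assumptions, not part of the original):

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=0)),
])
pipeline.fit(X_train_meteo, y_train_meteo)
print(pipeline.score(X_test_meteo, y_test_meteo))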
Choose any value of $k$ between 1 and 10. For different partitions of the full data set into training and testing sets (e.g., 10%/90%, 20%/80%, 30%/70%, etc.), obtain training and test error rates. Plot training and test error (on the same axes) vs. the proportion of training examples. Briefly comment on the insights that this plot yields.


#### <font color="green">Solution 2</font>

# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# calculation loop
error_dict = {'train_proportion': [],
              'train_error': [],
              'test_error': []} # store results in a dictionary
for split in np.arange(0.1, 1, 0.1):
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=split,
                                                        random_state=123) # split data
    model = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train) # create model
    error_dict['train_proportion'].append(1 - split) # store the train proportion ("split") for plotting
    error_dict['train_error'].append(1 - model.score(X_train, y_train)) # store train error
    error_dict['test_error'].append(1 - model.score(X_test, y_test)) # store test error
# plot with matplotlib
plt.rcParams.update({'font.size': 16}) # update font size of plot
plt.subplots(1, 1, figsize = (7, 5)) # create plot canvas
plt.plot(error_dict['train_proportion'], error_dict['train_error'], '-b', label='train error')  # plot the two lines
plt.plot(error_dict['train_proportion'], error_dict['test_error'], '-r', label='test error')
plt.grid() # the next few lines are formatting
plt.title('Train proportion vs error')
plt.xlabel('Train proportion')
plt.ylabel('Error')
plt.legend();
Example #12
# Rescale the image data to the 0-1 range
y = []
x = []

for d in data:
	(num, img) = d  # each element is a (label, image) pair
	img = img.reshape(-1).astype("float") / 255
	y.append(keras.utils.np_utils.to_categorical(num, out_size))
	x.append(img)

x=np.array(x)
y=np.array(y)

# Split into training and test sets
x_train,x_test,y_train,y_test = train_test_split(
	x,y,test_size=0.2,train_size=0.8
)

# Define the model
Dense = keras.layers.Dense
model = keras.models.Sequential()
model.add(Dense(512,activation="relu",input_shape=(in_size,)))
model.add(Dense(out_size,activation="softmax"))

# Compile the model and run training
model.compile(
	loss="categorical_crossentropy",
	optimizer="adam",
	metrics=['accuracy']
)
model.fit(
	x_train, y_train,
	batch_size=32, epochs=10  # assumed settings; the original call is truncated here
)
from sklearn.metrics import confusion_matrix
from matplotlib.colors import ListedColormap
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# data columns = (userid, gender, age, estimatedsalary, purchased)
data = pd.read_csv("Social_Network_Ads.csv")  # path assumed; the original call was missing the filename
data.head(10)

real_x = data.iloc[:,[2,3]].values
real_x

real_y = data.iloc[:,4].values
real_y


# now to split the dataset
training_x, test_x, training_y, test_y = train_test_split(real_x, real_y, test_size=0.25, random_state=0)
# to check the data set
training_x
test_x

# Feature scaling: when the independent variables have very different ranges,
# scaling transforms them onto a comparable scale (roughly -2 to 2)

scaler = StandardScaler()
training_x = scaler.fit_transform(training_x)
test_x = scaler.transform(test_x)  # transform only: the scaler must be fitted on training data alone

# Now we will make the classifier
classifier_LR = LogisticRegression(random_state=0)

# now to train the model
classifier_LR.fit(training_x, training_y)
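# A natural next step (not in the original snippet): predict on the test set
# and inspect the confusion matrix
pred_y = classifier_LR.predict(test_x)
print(confusion_matrix(test_y, pred_y))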
Example #14
import pickle
from sklearn.model_selection import train_test_split


def load_and_split_data(data_handling_params):
    # Load pickled feature data and labels, then return a train/test split.
    with open(data_handling_params['data_path'], 'rb') as f:
        data = pickle.load(f)
    with open(data_handling_params['labels_path'], 'rb') as f:
        labels = pickle.load(f)
    return train_test_split(data, labels, test_size=data_handling_params['test_percent'])
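A usage sketch (the paths and split fraction below are placeholder values, not from the original):

params = {
    'data_path': 'features.pkl',
    'labels_path': 'labels.pkl',
    'test_percent': 0.2,
}
X_train, X_test, y_train, y_test = load_and_split_data(params)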
Example #15
     pclass  survived     sex   age  sibsp  parch      ticket     fare cabin embarked boat   body            home.dest
899       3         1  female  27.0      0      2      347742  11.1333   NaN        S   15    NaN                  NaN
161        1         1  female  51.0      1      0       13502  77.9583   D11        S   10    NaN           Hudson, NY
254        1         1    male   NaN      0      0       19988  30.5000  C106        S    3    NaN  Manchester, England
409        2         0    male  36.0      0      0      229236  13.0000   NaN        S  NaN  236.0        Rochester, NY

In [12]: from sklean.model_selection import train_test_split
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-12-e254ffd72cf7> in <module>
----> 1 from sklean.model_selection import train_test_split

ModuleNotFoundError: No module named 'sklean'

In [13]: from sklearn.model_selection import train_test_split

In [14]: train, test = train_test_split(titanic, test_size=0.2)

In [15]: import statsmodels.formula.api as smf

In [16]: m = smf.logit(formula = 'survived ~ 1', data=train)

In [17]: m = smf.logit(formula = 'survived ~ 1', data=train).fit()
Optimization terminated successfully.
         Current function value: 0.665513
         Iterations 4

In [18]: m.summary()
Out[18]: 
<class 'statsmodels.iolib.summary.Summary'>
"""
                           Logit Regression Results