def fit(self, X, y):
    """Greedily drop one feature at a time until ``self.k_features`` remain.

    ``X`` and ``y`` are split once with ``train_test_split`` using
    ``self.test_size`` and ``self.random_state``.  Starting from every
    column index of ``X``, each round scores every subset that is one
    feature smaller (via ``self._cal_score``) and keeps the best-scoring
    one.

    Attributes set on the instance:
        indices_  -- tuple of column indices of the last selected subset
        subsets_  -- list of selected subsets, starting with the full set
        scores_   -- score of each entry in ``subsets_``
        k_score_  -- the last entry of ``scores_``

    Returns ``self``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=self.test_size, random_state=self.random_state
    )

    n_features = X.shape[1]
    self.indices_ = tuple(range(n_features))
    self.subsets_ = [self.indices_]
    # Baseline: score obtained with every feature present.
    self.scores_ = [
        self._cal_score(X_train, X_test, y_train, y_test, self.indices_)
    ]

    while n_features > self.k_features:
        # Every way of removing exactly one feature from the current subset.
        candidates = list(combinations(self.indices_, r=n_features - 1))
        candidate_scores = [
            self._cal_score(X_train, X_test, y_train, y_test, subset)
            for subset in candidates
        ]

        winner = np.argmax(candidate_scores)
        self.indices_ = candidates[winner]
        self.subsets_.append(self.indices_)
        self.scores_.append(candidate_scores[winner])
        n_features -= 1

    self.k_score_ = self.scores_[-1]
    return self
def model_auto_tpot(
    df,
    colX,
    coly,
    outfolder="aaserialize/",
    model_type="regressor/classifier",
    train_size=0.5,
    generation=1,
    population_size=5,
    verbosity=2,
):
    """
    Automatic training of df[colX] ---> df[coly] with TPOT; exports the
    resulting scikit-learn pipeline code to a file in ``outfolder``.
    Very Slow Process, use lower number of Sample

    :param df: object indexable by ``colX`` / ``coly`` whose selections expose
               ``.values`` (presumably a pandas DataFrame -- confirm with callers)
    :param colX: column selector for the features
    :param coly: column selector for the target
    :param outfolder: folder prefix of the exported pipeline file
    :param model_type: "regressor" or "classifier"
    :param train_size: forwarded to ``train_test_split``; the remainder is the test set
    :param generation: forwarded to TPOT as ``generations``
    :param population_size: forwarded to TPOT as ``population_size``
    :param verbosity: forwarded to TPOT as ``verbosity``
    :return: path of the exported pipeline file
    :raises ValueError: if ``model_type`` is neither "regressor" nor "classifier"
    """
    # Fail early with a clear message; the placeholder default
    # "regressor/classifier" is not itself a valid choice.
    if model_type not in ("regressor", "classifier"):
        raise ValueError(
            "model_type must be 'regressor' or 'classifier', got %r" % (model_type,)
        )

    tpot = import_("tpot")

    X = df[colX].values
    y = df[coly].values
    # Honour the caller's train_size; test_size defaults to its complement.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size)

    if model_type == "regressor":
        clf = tpot.TPOTRegressor(
            generations=generation, population_size=population_size, verbosity=verbosity
        )
    else:
        clf = tpot.TPOTClassifier(
            generations=generation, population_size=population_size, verbosity=verbosity
        )

    print("Start")
    clf.fit(X_train, y_train)

    # score/export are methods of the fitted estimator, not of the tpot module.
    score = clf.score(X_test, y_test)
    print("score", score)

    file1 = outfolder + "/tpot_regression_pipeline_" + str(np.random.randint(1000, 9999)) + ".py"
    clf.export(file1)
    return file1
# Features: one categorical value per sample, shaped as a single column.
X = name_letters
X = np.array(X).reshape(-1, 1)
# Target: 0 where the label is 'male', 1 otherwise.
y = np.where(labeled_names[ind, 1] == 'male', 0, 1)

from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()
lb.fit(X)
X2 = lb.transform(X)

# train_test_split lives in sklearn.model_selection and returns
# (X_train, X_test, y_train, y_test) -- in that order.
from sklearn.model_selection import train_test_split
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y, test_size=0.4, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
clf = MultinomialNB(alpha=0.1, fit_prior=True)
clf.fit(X_train2, y_train2)

# Predictions on the binarized train / test features.
y_train_pred = clf.predict(X_train2)
y_test_pred = clf.predict(X_test2)
def train_val_test(df, y):
    """Split ``df`` into train / validation / test parts, stratified on ``y``.

    The first split holds out 70% of the rows; that remainder is then split
    in half, so the parts are roughly 30% / 35% / 35% of ``df``.  Both splits
    use ``random_state=42``.

    :param df: samples to split
    :param y: labels aligned with ``df``, used only for stratification
    :return: ``(train, val, test)`` subsets of ``df``
    """
    # Split the labels together with the data so the second split can be
    # stratified on labels that match the remainder's rows.
    train, remainder, _, y_remainder = train_test_split(
        df, y, test_size=.70, stratify=y, random_state=42
    )
    val, test = train_test_split(
        remainder, test_size=.50, stratify=y_remainder, random_state=42
    )
    return train, val, test
def train_test(self, test_size):
    """Split ``self.x`` / ``self.y`` and store the four parts on the instance.

    ``test_size`` is forwarded to ``train_test_split``; ``random_state`` is
    fixed at 0.  The results are stored as ``self.x_train``, ``self.x_test``,
    ``self.y_train`` and ``self.y_test``.  Nothing is returned.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        self.x, self.y, test_size=test_size, random_state=0
    )
    self.x_train = x_train
    self.x_test = x_test
    self.y_train = y_train
    self.y_test = y_test
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset (must be loaded before X and y are extracted)
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values  # every column except the last
y = dataset.iloc[:, 4].values    # column 4 only (a 1-D target)

# Encoding categorical data (column 3 of X)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder = LabelEncoder()
X[:, 3] = labelencoder.fit_transform(X[:, 3])
# OneHotEncoder(categorical_features=...) was removed from scikit-learn;
# ColumnTransformer applies the encoder to column 3 and passes the other
# columns through.  Encoded columns come first, as with the legacy API.
# sparse_threshold=0 forces a dense array (the old code called .toarray()).
column_transformer = ColumnTransformer(
    [("onehot", OneHotEncoder(), [3])],
    remainder="passthrough",
    sparse_threshold=0,
)
X = column_transformer.fit_transform(X).astype(float)
onehotencoder = column_transformer.named_transformers_["onehot"]

# Drop column 0 (the first one-hot column) -- presumably to avoid the
# dummy-variable trap; confirm intent.
X = X[:, 1:]

# Split the data into training and test sets.
# train_test_split returns (X_train, X_test, y_train, y_test) -- in that order.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 / 3, random_state=0)
# NOTE(review): `dataset` and `f_matr` are defined before this snippet;
# f_matr is assumed to be the 2-D feature array taken from `dataset` -- confirm.
r_matr = dataset.iloc[:, -1].values

# Fill in missing data with mean method
# (Imputer was removed from scikit-learn; SimpleImputer is its replacement
# and always works column-wise, which is what axis=0 used to select.)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=float("nan"), strategy="mean")
imputer = imputer.fit(f_matr[:, [1, 2]])
f_matr[:, [1, 2]] = imputer.transform(f_matr[:, [1, 2]])

# Categorical data encoding (column 0 of f_matr)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
country_encoder = LabelEncoder()
f_matr[:, 0] = country_encoder.fit_transform(f_matr[:, 0])
# OneHotEncoder(categorical_features=...) was removed from scikit-learn;
# ColumnTransformer applies the encoder to column 0 and passes the other
# columns through.  Encoded columns come first, as with the legacy API.
# sparse_threshold=0 forces a dense array (the old code called .toarray()).
column_transformer = ColumnTransformer(
    [("onehot", OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,
)
f_matr = column_transformer.fit_transform(f_matr).astype(float)
onehotencoder = column_transformer.named_transformers_["onehot"]

response_encoder = LabelEncoder()
r_matr = response_encoder.fit_transform(r_matr)

# Separating training and testing set
from sklearn.model_selection import train_test_split
f_matr_train, f_matr_test, \
    r_matr_train, r_matr_test = train_test_split(f_matr, r_matr, test_size=0.2, random_state=0)

# Feature Scaling -- age and salary columns should be same range
from sklearn.preprocessing import StandardScaler
standard_scaler = StandardScaler()
f_matr_train = standard_scaler.fit_transform(f_matr_train)
# Reuse the statistics learned from the training set; re-fitting on the
# test set would scale the two sets inconsistently.
f_matr_test = standard_scaler.transform(f_matr_test)