# Carry the model column over to the feature frame unchanged.
sample_df['Model'] = df['Model']

# One-hot encode the cylinder column.
origin_dummies = pd.get_dummies(df['Cylinders'])

# Append every indicator column to the feature frame.
for origin_column in origin_dummies.columns:
    sample_df[origin_column] = origin_dummies[origin_column]

# 70% of the rows are used for the search; the rest are held out for scoring.
X_train, X_test, y_train, y_test = train_test_split(
    sample_df,
    labels,
    train_size=0.7,
)

le = preprocessing.LabelEncoder()

# Run the evolutionary search, report the held-out score, export the winner.
tpot = TPOTClassifier(generations=7, population_size=15, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_cars_pipeline.py')

# ---------------------------------------------------------------------------
# Results recorded from earlier runs
# ---------------------------------------------------------------------------
# tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
# Best pipeline: GradientBoostingClassifier(RobustScaler(input_matrix),
#   GradientBoostingClassifier__learning_rate=1.0,
#   GradientBoostingClassifier__max_depth=5,
#   GradientBoostingClassifier__max_features=0.25,
#   GradientBoostingClassifier__min_samples_leaf=DEFAULT,
#   GradientBoostingClassifier__min_samples_split=17,
#   GradientBoostingClassifier__n_estimators=100,
#   GradientBoostingClassifier__subsample=0.7)
# 0.770491803279
#
# tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)
# Best pipeline: ExtraTreesClassifier(input_matrix,
#   ExtraTreesClassifier__bootstrap=False,
#   ExtraTreesClassifier__criterion=DEFAULT,
#   ExtraTreesClassifier__max_features=0.45,
#   ExtraTreesClassifier__min_samples_leaf=1,
#   ExtraTreesClassifier__min_samples_split=7,
#   ExtraTreesClassifier__n_estimators=DEFAULT)
# 0.762295081967
#
# Without MPG
# tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
# Best pipeline: ExtraTreesClassifier(input_matrix,
#   ExtraTreesClassifier__bootstrap=DEFAULT,
#   ExtraTreesClassifier__criterion=gini,
#   ExtraTreesClassifier__max_features=0.45,
#   ExtraTreesClassifier__min_samples_leaf=1,
#   ExtraTreesClassifier__min_samples_split=6,
#   ExtraTreesClassifier__n_estimators=DEFAULT)
# Notebook-style display of the split sizes (a bare expression; no output
# when executed as a plain script).
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# In[3]:

from config.classifier_models_only import classifier_config_dict

# Passed to TPOT below as max_time_mins.
time_allocated = 60

# In[4]:

tpot = TPOTClassifier(
    max_time_mins=time_allocated,
    config_dict=classifier_config_dict,
    verbosity=3,
    scoring="neg_log_loss",
    n_jobs=8,
)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

# In[4]:

# tpot.export('tpot_mnist_pipeline.py')

# In[ ]:
'sklearn.feature_selection.SelectFromModel': { 'threshold': np.arange(0, 1.01, 0.05), 'estimator': { 'sklearn.ensemble.ExtraTreesClassifier': { 'n_estimators': [100], 'criterion': ['gini', 'entropy'], 'max_features': np.arange(0.05, 1.01, 0.05) } } } } # generations 确定子代的迭代次数 # population_size=10 是创建个体的初始数量 # offspring_size 每一代所需创造个体数 # crossover_rate 用于创造后代的个体所占的百分比 # mutation_rate 属性值随机更改的概率 # 基于遗传算法的一个东西 tpot = TPOTClassifier(generations=1, population_size=10, verbosity=2, config_dict=tpot_config) tpot.fit(X_train, y_train) tpot.score(X_test, y_test) tpot.export('/Users/sheng/PycharmProjects/untitled/guowei/chishi.py') #tpot.score() # tpot.export(result.py) 导出标准的scikit-learn代码
"""**7. Selecting model using TPOT** """ # Import TPOTClassifier and roc_auc_score from tpot import TPOTClassifier from sklearn.metrics import roc_auc_score # Instantiate TPOTClassifier tpot = TPOTClassifier( generations=5, population_size=20, verbosity=2, scoring='roc_auc', random_state=42, disable_update_check=True, config_dict='TPOT light' ) tpot.fit(X_train, y_train) # AUC score for tpot model tpot_auc_score = roc_auc_score(y_test, tpot.predict_proba(X_test)[:, 1]) print(f'\nAUC score: {tpot_auc_score:.4f}') print('\nbest pipeline steps:',end='\n') for idx, (name, transform) in enumerate(tpot.fitted_pipeline_.steps, start=1): print(f'{idx}. {transform}') """**8. Checking the variance**
# Load the sheet (row 1 holds the header, column 0 the index) and shorten
# the name of the target column.
data = pd.read_excel(path, header=1, index_col=0)
data = data.rename(columns={'default payment next month': "default"})
print2(data.head())

# The last column is the target; every other column is a feature.  The split
# is stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    data.iloc[:, :-1],
    data.iloc[:, -1],
    stratify=data.iloc[:, -1],
    test_size=0.3,
)
print2(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Assign the values outlined to the inputs
number_generations = 3
population_size = 5
offspring_size = 10
scoring_function = "accuracy"

# Create the tpot classifier
tpot_clf = TPOTClassifier(
    generations=number_generations,
    population_size=population_size,
    offspring_size=offspring_size,
    scoring=scoring_function,
    verbosity=2,
    random_state=2,
    cv=2,
)

# Fit the classifier to the training data
tpot_clf.fit(X_train, y_train)

# Score on the test set
print(tpot_clf.score(X_test, y_test))
import numpy as np

# Load the data
telescope = pd.read_csv('MAGIC Gamma Telescope Data.csv')

# Shuffle the rows and renumber the index from 0
telescope_shuffle = telescope.iloc[np.random.permutation(len(telescope))]
tele = telescope_shuffle.reset_index(drop=True)

# Encode the two classes as integers
tele['Class'] = tele['Class'].map({'g': 0, 'h': 1})
tele_class = tele['Class'].values

# One stratified 75/25 split of the row index.  Only two partitions exist:
# the held-out part was previously bound under both `validation_indices` and
# `testing_indices` through a chained unpacking, so both names are kept.
training_indices, validation_indices = train_test_split(
    tele.index, stratify=tele_class, train_size=0.75, test_size=0.25)
testing_indices = validation_indices

# Let genetic programming find the best model and hyperparameters
tpot = TPOTClassifier(generations=5, verbosity=2)
tpot.fit(
    tele.drop('Class', axis=1).loc[training_indices].values,
    tele.loc[training_indices, 'Class'].values)

# Report the score on the held-out rows (it used to be computed and dropped)
print(tpot.score(
    tele.drop('Class', axis=1).loc[validation_indices].values,
    tele.loc[validation_indices, 'Class'].values))

# Export the generated code
tpot.export('pipeline.py')
y_test = Label(y_test_string)


def _print_label_summary(labels):
    """Print the count of labels equal to 1, the total, and the share not equal to 1.

    The share used to be computed as (ones - total) / total, which is never
    positive; it is now (total - ones) / total.
    NOTE(review): the printed text implies label 1 marks animals -- confirm.
    """
    n_ones = sum(labels == 1)
    n_total = len(labels)
    print(n_ones)
    print(n_total)
    print('percentage of not animals=', (n_total - n_ones) / n_total)


_print_label_summary(y_test)
""" -------------- TPOT does is magic------------------------------------- """
_print_label_summary(y_test)

from tpot import TPOTClassifier

clf = TPOTClassifier(verbosity=2, n_jobs=-1)
clf.fit(X_train, y_train)

# Same summary after fitting (a stray quote here used to be a SyntaxError).
_print_label_summary(y_test)

print('test score=', clf.score(X_test, y_test))
predictions = clf.predict(X_test)
print(confusion_matrix(y_test, predictions))

#digits=load_digits()
#X=digits['data']
def tpot(self):
    """Fit a TPOT classifier on the training attributes and print its score.

    Reads ``self.train_data`` / ``self.train_label`` for fitting and
    ``self.predi_data`` / ``self.predi_label`` for scoring.  Returns None.
    """
    from tpot import TPOTClassifier

    optimizer = TPOTClassifier(generations=5, population_size=20, verbosity=2)
    optimizer.fit(self.train_data, self.train_label)
    holdout_score = optimizer.score(self.predi_data, self.predi_label)
    print(holdout_score)
# Target column and feature frame.
y = df['Result_of_Treatment']
y.head()
X = df.drop('Result_of_Treatment', axis=1)
X.shape, y.shape

# Hold out 20% of the rows for the final TPOT score.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape
X_test.shape
y_train.shape

# NOTE(review): X_scaled is computed here but none of the models below use it.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled.shape

clf = LogisticRegression()
svmclf = SVC(kernel='rbf')
rfclf = RandomForestClassifier()
tpotclf = TPOTClassifier()

model1 = clf.fit(X_train, y_train)
model2 = svmclf.fit(X_train, y_train)
model3 = rfclf.fit(X_train, y_train)
# fit() needs the labels as well; they were missing from this call.
model_auto_clf = tpotclf.fit(X_train, y_train)

score = cross_val_score(clf, X_train, y_train)
score2 = cross_val_score(svmclf, X_train, y_train)
score3 = cross_val_score(rfclf, X_train, y_train)
# cross_val_score returns one value per fold; report the mean.  The format
# string was previously printed verbatim, never interpolated.
print("score is: %.2f" % score3.mean())

## Tpot classifier
tpotclf = TPOTClassifier(generations=5, cv=5)
model_tpot_clf = tpotclf.fit(X_train, y_train)
score = tpotclf.score(X_test, y_test)
print(score)
tpotclf.export('classifier-pipeline.py')
label_type = parse_dataset.label_type(y, stated_input_column) score_type = "accuracy" if label_type == "Ordinal": models = model_info.fast_ordinal_models() score_type = "r2" tpot = None if label_type == "Ordinal": tpot = TPOTRegressor(generations=5, population_size=20, verbosity=2, max_eval_time_mins=40, scoring='r2') else: tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, max_eval_time_mins=40) @timeout(7200) def train_tpot_model(x, y, tpot): X_train, X_test, y_train, y_test = train_test_split(x[0:10000], y[0:10000], train_size=0.8, test_size=0.2) tpot.fit(np.array(X_train), np.array(y_train)) return tpot error = None try:
# Fragment (the opening `if` of the first branch is outside this view).
# Copies the training arrays, then appends `args.augs` noisy copies of train_X
# (train_X multiplied element-wise by normal noise with loc=1, scale=0.1)
# together with repeated labels.  If "1hot" is not in args.dataset it runs a
# "TPOT light" search, predicts on the augmented/train/test sets, and saves
# the pipeline as a .py export and a joblib dump; otherwise it builds an
# AutoNetImageClassification object.
tr_Y = deepcopy(train_Y).astype(np.int32) else: tr_X = deepcopy(train_X) tr_Y = deepcopy(train_Y).astype(np.int32) print(tr_X.shape, tr_Y.shape) for a in range(int(args.augs)): current_aug = train_X * np.random.normal( size=train_X.shape, loc=1, scale=0.1) tr_X = np.concatenate([tr_X, current_aug]).astype(np.float32) tr_Y = np.concatenate([tr_Y, train_Y]).astype(np.int32) print(tr_X.shape, tr_Y.shape) if "1hot" not in args.dataset: tpot = TPOTClassifier(generations=20, population_size=5, verbosity=2, scoring="balanced_accuracy", cv=10, config_dict="TPOT light", random_state=int(args.seed)) tpot.fit(tr_X, tr_Y) tr_Yhat = tpot.fitted_pipeline_.predict(tr_X) train_Yhat = tpot.fitted_pipeline_.predict(train_X) test_Yhat = tpot.fitted_pipeline_.predict(test_X) tpot.export(args.output + ".py") joblib.dump(tpot.fitted_pipeline_, args.output + ".joblib") else: auto = AutoNetImageClassification("medium_cs", log_level='info', max_runtime=100, min_budget=15, max_budget=45)
print("Data X contains NaN values")
# dropna() returns a new frame; rebind it so the rows are actually removed
# (the result used to be discarded).
df = df.dropna()

# In[18]:

n = 50
B = np.array_split(df, n)

# In[ ]:

pipeline_optimizer = TPOTClassifier(generations=100,
                                    warm_start=True,
                                    verbosity=2,
                                    max_time_mins=60,
                                    early_stop=5)
# f= open("Test_scores.txt","a+")

# Initialization: fit on the first chunk (last column is the target).
X = B[0].iloc[:, 0:-1]
y = B[0].iloc[:, -1]
start = time.time()
pipeline_optimizer.fit(X, y)

# Score the fitted optimizer on each remaining chunk.
for i in range(1, n):
    X = B[i].iloc[:, 0:-1]
    y = B[i].iloc[:, -1]
    accuracy = pipeline_optimizer.score(X, y)
# Apply the already-fitted scaler to the test features.
Xtest = scaler.transform(Xtest)

# Oversample the training set with ADASYN.
n_neigh = 27
print('n adasyn', n_neigh)
ada = ADASYN(
    random_state=91,
    n_neighbors=n_neigh,
    sampling_strategy=1,
    n_jobs=6,
)
Xtrain, ytrain = ada.fit_resample(Xtrain, ytrain)

# RF optimisation: the search space is limited to RandomForestClassifier.
'''Optimización RF'''
tpot_classifier = TPOTClassifier(
    generations=5,
    population_size=10,
    offspring_size=5,
    verbosity=2,
    early_stop=3,
    config_dict={'sklearn.ensemble.RandomForestClassifier': parameters},
    cv=4,
    scoring='roc_auc',
    n_jobs=12,
)

# Model fitting.
'''Ajuste del modelo'''
tpot_classifier.fit(Xtrain, ytrain)

# Prediction: labels plus element 1 of each predict_proba row.
'''Predicción'''
rf_y_pred = tpot_classifier.predict(Xtest)
rf_y_prob = [probs[1] for probs in tpot_classifier.predict_proba(Xtest)]

ypred_df = pd.DataFrame(rf_y_pred, columns=['Label pred'])
pathy_pred = 'C:/Users/jkgv1/OneDrive/Escritorio/' + 'ypred' + 'fold' + str(
    fold) + '.xlsx'
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import f1_score
import lightgbm as lgb
from tpot import TPOTClassifier
from sklearn import preprocessing

if __name__ == "__main__":
    # Every column except 'type' is a feature; 'type' is the target.
    train_df = pd.read_csv('./new_train.csv', index_col=0)
    y_train = train_df['type'].values
    X_train = train_df.drop(columns=['type']).values

    # Encode the target labels as integers.
    le = preprocessing.LabelEncoder()
    y_train = le.fit_transform(y_train)

    # Macro-F1 search with 5-fold cross-validation, then export the winner.
    pipeline_optimizer = TPOTClassifier(
        generations=100,
        population_size=100,
        cv=5,
        scoring='f1_macro',
        random_state=42,
        verbosity=2,
        n_jobs=8,
    )
    pipeline_optimizer.fit(X_train, y_train)
    pipeline_optimizer.export('tpot_exported_pipeline.py')
X = df.drop('event', axis=1)
y = df.event

# Encode y as category codes; the codes follow np.sort(y.unique()).
y = y.astype('category').cat.codes

# %%
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=42,
)

# %%
# PCA
# Scale data first: the scaler is fitted on the training rows only.
scale = StandardScaler()
X_train_scaled = scale.fit_transform(X_train)
X_test_scaled = scale.transform(X_test)

# Project onto 30 components, again fitted on the training rows only.
pca = PCA(n_components=30)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# %%
tpot = TPOTClassifier(verbosity=2, random_state=42)
tpot.fit(X_train_pca, y_train)
print(tpot.score(X_test_pca, y_test))
tpot.export('tpot_project_pipeline.py')
def prediction(self):
    """
    Fit three classifiers on the training data and print their accuracy.

    Reads ``self.train_data`` and ``self.test_data`` (DataFrames), keeps the
    columns selected by the regexes below, and prints training accuracy and
    cross-validated accuracy for logistic regression, bagged decision trees
    and TPOT.

    Results recorded by the original author:
        LR accuracy 80.92%, cross-validated accuracy 80.59%
            parameters: penalty='l1', tol=1e-6
        bagging_cart accuracy 96.52%, cross-validated accuracy 80.25%
            parameters: n_estimators=20, max_samples=0.8, max_features=1.0,
            bootstrap=True, bootstrap_features=False, n_jobs=-1
        TPOT accuracy 90.24%; best pipeline selected (81.48%):
            RandomForestClassifier(input_matrix, bootstrap=True,
            criterion=entropy, max_features=0.8500000000000001,
            min_samples_leaf=1, min_samples_split=16, n_estimators=100)

    :return: None; everything is printed.
    """
    print('-' * 30 + '乘 客 幸 存 情 况 预 测' + '-' * 30)

    # DataFrame.as_matrix() was removed from pandas; .values gives the same
    # ndarray on old and new versions alike.
    train_data = self.train_data.filter(
        regex=
        'Survived|Age_scaled|SibSp|Parch|Fare_scaled|Embarked_.*|Sex_.*|Pclass_.*'
    )
    train_data = train_data.values
    test_data = self.test_data.filter(
        regex=
        'Age_scaled|SibSp|Parch|Fare_scaled|Embarked_.*|Sex_.*|Pclass_.*')
    test_data = test_data.values

    # Column 0 is used as the target, the remaining columns as features.
    train_x = train_data[:, 1:]
    train_y = train_data[:, 0]

    # Logistic regression.  The l1 penalty needs a solver that supports it;
    # liblinear was the implicit default in older scikit-learn releases.
    model_lr = LogisticRegression(penalty='l1', tol=1e-6, solver='liblinear')
    model_lr.fit(train_x, train_y)
    predictions = model_lr.predict(test_data)
    result = pd.DataFrame({
        'PassengerId': self.test_data['PassengerId'].values,
        'Survived': predictions.astype(np.int32)
    })
    print(result[:5])

    # Accuracy on the training set
    predictions = model_lr.predict(train_x)
    print('lr模型准确率为%.4lf' % accuracy_score(train_y, predictions))
    print('lr模型使用交叉验证的准确率为%.4lf\n\n' %
          np.mean(cross_val_score(model_lr, train_x, train_y, cv=10)))

    # Bagging ensemble of decision trees
    model_cart = DecisionTreeClassifier()
    bagging_cart = BaggingClassifier(model_cart,
                                     n_estimators=20,
                                     max_samples=0.8,
                                     max_features=1.0,
                                     bootstrap=True,
                                     bootstrap_features=False,
                                     n_jobs=-1)
    bagging_cart.fit(train_x, train_y)

    # Accuracy on the training set
    predictions = bagging_cart.predict(train_x)
    print('bagging_cart模型准确率为%.4lf' % accuracy_score(train_y, predictions))
    print('bagging_cart模型使用交叉验证的准确率为%.4lf\n\n' %
          np.mean(cross_val_score(bagging_cart, train_x, train_y, cv=3)))

    # TPOT
    model_tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
    model_tpot.fit(train_x, train_y)

    # Accuracy on the training set
    predictions = model_tpot.predict(train_x)
    print('TPOT模型准确率为%.4lf' % accuracy_score(train_y, predictions))
    print('TPOT模型使用交叉验证的准确率为%.4lf\n\n' %
          np.mean(cross_val_score(model_tpot, train_x, train_y, cv=3)))
def genetic_algorithm(X_train, X_test, y_train, y_test):
    """Run a 100-generation TPOT search and print the score on the test split.

    Fits on ``X_train`` / ``y_train`` and scores on ``X_test`` / ``y_test``.
    Returns None.
    """
    from tpot import TPOTClassifier

    search_options = {
        'generations': 100,
        'population_size': 20,
        'verbosity': 2,
    }
    optimizer = TPOTClassifier(**search_options)
    optimizer.fit(X_train, y_train)
    test_score = optimizer.score(X_test, y_test)
    print(test_score)
from tpot import TPOTClassifier
from tools import prepare_dataset

# prepare_dataset() returns the targets first, then the features.
y, x = prepare_dataset()

# The first 614 rows are used for the search, the rest are held out.
x_train = x[:614]
y_train = y[:614].reshape(-1, )
x_valid = x[614:]
y_valid = y[614:].reshape(-1, )

pipeline_optimizer = TPOTClassifier(generations=50,
                                    population_size=20,
                                    cv=5,
                                    random_state=42,
                                    verbosity=2)
pipeline_optimizer.fit(x_train, y_train)

# Score on the held-out rows.  The validation split used to be built and
# never used, while the printed score came from the training data itself.
print(pipeline_optimizer.score(x_valid, y_valid))
pipeline_optimizer.export('tpot_exported_pipeline.py')
""" # TAKE SAMPLE DATA AND CAPTURE SIZE ------------------------------------------ x_train, x_test, y_train, y_test = train_test_split(train, y, train_size=0.00025) # capture the number of rows and features in the train dataset to be added in our results xt_nrows = int(x_train.shape[0]) xt_numb_feats = int(x_train.shape[1]) # BUILD AND RUN THE TPOT ------------------------------------------------------- my_tpot = TPOTClassifier(generations=25, population_size=50, n_jobs=2, verbosity=2, scoring=score_type, cv=5, random_state=1776, warm_start=True) my_tpot.fit(x_train, y_train) # DETERMINE BEST CV SCORE PIPELINE ---------------------------------------------- best_pipes = my_tpot.pareto_front_fitted_pipelines_ len_best_pipes = len(best_pipes) best_pipe_key = list( best_pipes.keys())[(len_best_pipes - 1)] # key is entire pipeline as string best_cv = abs(my_tpot.evaluated_individuals_[best_pipe_key][1])
df = pd.read_csv('data/datalab_persona_run1_with_scale_cat.csv')

# Separate the target from the features.
target = df['FKSmoker'].values
df.drop(['FKSmoker'], inplace=True, axis=1)

# Every column except these five is one-hot encoded.
cols = [
    col for col in df.columns.values
    if col not in
    ['Age Next at DOC', 'Height', 'Weight', 'Annual Salary', 'Travel %']
]
df = pd.get_dummies(df, columns=cols)
data = df.values

X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    train_size=0.75,
    test_size=0.25,
)

# Search with the 'TPOT sparse' configuration, scored by balanced accuracy.
tpot = TPOTClassifier(
    generations=100,
    population_size=100,
    verbosity=2,
    n_jobs=2,
    config_dict='TPOT sparse',
    scoring='balanced_accuracy',
)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_fmi_pipeline_sparse_100_100_cat.py')
def train(self):
    """Train a TPOT classifier on the stored arrays and pickle the fitted pipeline.

    Loads the feature and label arrays named by ``self.training_data_filename``
    and ``self.training_labels_filename`` from ``AUTO_ML_DATA_PATH``, fits a
    ``TPOTClassifier`` configured from this object's attributes, and pickles
    ``fitted_pipeline_`` to a timestamped file under ``AUTO_ML_MODELS_PATH``.

    On success sets ``training_time``, ``model_path``, ``status='success'``
    and ``additional_remarks`` (the pipeline's string form).  On any exception
    sets ``status='fail'`` and stores the error text in ``additional_remarks``.
    The object is saved in both cases; nothing is re-raised.
    """
    print('in tpot training')
    # Stays None until fitting starts, so the failure path knows whether an
    # elapsed time can be reported.
    start = None
    try:
        # Storing save location for models
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')
        dump_file = os.path.join(AUTO_ML_MODELS_PATH,
                                 'tpot_' + timestamp + '.dump')

        x = numpy.load(
            os.path.join(AUTO_ML_DATA_PATH, self.training_data_filename))
        y = numpy.load(
            os.path.join(AUTO_ML_DATA_PATH, self.training_labels_filename))

        if self.preprocessing_object.input_data_type == 'png':
            x = reformat_data(x)

        # training the models
        print('about to train')
        model = TPOTClassifier(
            # Quick-run settings kept for reference:
            # verbosity=2, max_time_mins=90, max_eval_time_mins=5,
            # config_dict='TPOT light', population_size=4, generations=3,
            # n_jobs=1
            generations=self.generations,
            population_size=self.population_size,
            offspring_size=self.offspring_size,
            mutation_rate=self.mutation_rate,
            crossover_rate=self.crossover_rate,
            scoring=self.scoring,
            cv=self.cv,
            subsample=self.subsample,
            n_jobs=self.n_jobs,
            # Tpot takes input in mins while most other frameworks take
            # inputs in seconds.
            max_time_mins=self.max_time_mins,
            max_eval_time_mins=self.max_eval_time_mins,
            random_state=self.random_state,
            config_dict=self.config_dict,
            warm_start=self.warm_start,
            memory=self.memory,
            use_dask=self.use_dask,
            early_stop=self.early_stop,
            verbosity=self.verbosity,
            disable_update_check=self.disable_update_check)

        print('before training start')
        start = time.time()
        model.fit(x, y)
        end = time.time()
        print('training finnished')

        with open(dump_file, 'wb') as f:
            print('about to save!')
            pickle.dump(model.fitted_pipeline_, f)
            print('model saved')

        self.training_time = round(end - start, 2)
        self.model_path = dump_file
        self.status = 'success'
        self.save()

        self.additional_remarks = str(model.fitted_pipeline_)
        self.save()

    except Exception as e:
        # Deliberate best-effort boundary: record the failure on the object
        # instead of propagating it.
        end = time.time()
        if start is not None:
            self.training_time = round(end - start, 2)

        self.status = 'fail'
        # Store text, matching the success path, not the exception object.
        self.additional_remarks = str(e)
        self.save()
# Fragment: the first statements are the tail of a helper whose header is
# outside this view; it appends per-line feature dicts to meta_list and
# returns them as a DataFrame.  The script part then builds train and test
# frames with dense2sparse, fits TPOTClassifier(generations=10) and a
# 100-tree RandomForestClassifier on the training frame, and writes the
# random-forest predictions to submissions/submission1.txt.
# NOTE(review): tpot_output is computed but not written anywhere in this fragment.
features[key] = value # append dictionaries of each line to a meta list to be transormed # to a DataFrame meta_list.append(features) return pd.DataFrame(meta_list) sparse_train_df = dense2sparse(train, test) sparse_test_df = dense2sparse(train, test, is_test=True) ###################################################################### # feature engineering ####################################################################### # TPOT my_tpot = TPOTClassifier(generations=10) my_tpot = my_tpot.fit(sparse_train_df, label) tpot_output = my_tpot.predict(sparse_test_df) # RandomForestClassifier rf = RandomForestClassifier(n_estimators=100) forest = rf.fit(sparse_train_df, label) output = forest.predict(sparse_test_df) np.savetxt(r'submissions/submission1.txt', output, fmt='%s')
]], dtype=int) with open("./classification_tables.pkl", 'rb') as fp: datasets = pickle.load(fp) data = datasets[0]['data'] X = data.iloc[:, :-1].to_numpy(dtype=int) y = data.iloc[:, -1].to_numpy(dtype=int) X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7) tpot = TPOTClassifier(generations=20, population_size=20, n_jobs=8, verbosity=2, scoring='balanced_accuracy') tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) output_fname = f"pipeline_{datasets[0]['assay']}.py" tpot.export(output_fname) print("## EXPORTED FILE: ##") with open(output_fname, 'r') as fp: print(fp.read()) print("## END EXPORTED FILE ##") print("## PFHXS PREDICTED PROBABILITY:")
train_size=0.75, test_size=0.25, random_state=seed) tpot_config = { 'xgboost.sklearn.XGBClassifier': { 'max_depth': [2, 3, 4], 'learning_rate': [1.0], 'silent': [1.0], 'n_estimators': [5, 10, 15] } } pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=10, random_state=seed, verbosity=3, periodic_checkpoint_folder='checkpoints', config_dict=tpot_config) pipeline_optimizer.fit(X_train, y_train) print(pipeline_optimizer.score(X_test, y_test)) pipeline_optimizer.export('tpot_exported_pipeline.py') ei = pipeline_optimizer.evaluated_individuals_ joblib.dump(ei, 'evaluated_individuals.pkl') tmp = joblib.load('evaluated_individuals.pkl') print "read back from joblib:", tmp
alg = dict_to_obj(algorithm) if not hasattr(alg, 'automl'): alg.automl = True if not hasattr(alg, 'sampling'): alg.sampling = False model = None if alg.is_supervised: # ------------------------------------------------------------- # Classification algorithms # if alg.name == 'TPOT_Classifier': from tpot import TPOTClassifier model = TPOTClassifier( generations=alg.generations, cv=alg.cv, scoring=alg.scoring, verbosity=alg.verbosity ) elif alg.name == 'AutoSklearn_Classifier': from autosklearn import classification if alg.sampling: model = classification.AutoSklearnClassifier( time_left_for_this_task=alg.task_time, per_run_time_limit=alg.run_time, resampling_strategy=alg.sampling_strategy, resampling_strategy_arguments={'folds': alg.folds} ) else: model = classification.AutoSklearnClassifier( time_left_for_this_task=alg.task_time, per_run_time_limit=alg.run_time
""" # dataset preparation from tpot import TPOTClassifier import pandas as pd import numpy as np from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer from sklearn import decomposition, ensemble from sklearn.naive_bayes import MultinomialNB from sklearn import pipeline data = pd.read_csv('../dataset/bbc_articles_labels_all.csv') # split the dataset into training and validation datasets train_x, valid_x, train_y, valid_y = model_selection.train_test_split( data['text'], data['category']) #Label encoding the target variables from sklearn.preprocessing import LabelEncoder categoryLableEncoder = LabelEncoder() train_y = categoryLableEncoder.fit_transform(train_y) valid_y = categoryLableEncoder.transform(valid_y) tfidf_transformer = TfidfVectorizer() X_train_tfidf = tfidf_transformer.fit_transform(train_x) X_train_df = pd.DataFrame(X_train_tfidf.toarray()) tpot_clf = TPOTClassifier(generations=10) tpot_clf.fit(X_train_df, train_y)
from tpot import TPOTClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np

# Load iris and cast both features and targets to float64 before splitting.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data.astype(np.float64),
    iris.target.astype(np.float64),
    train_size=0.75,
    test_size=0.25,
)

# Show the container type of each split.
print(type(X_train))
print(type(X_test))
print(type(y_train))
print(type(y_test))

tpot = TPOTClassifier(
    generations=5,
    population_size=50,
    verbosity=2,
)
tpot.fit(X_train, y_train)

# Predictions, class probabilities and score, all on the held-out rows.
print(tpot.predict(X_test))
print(tpot.predict_proba(X_test))
print(tpot.score(X_test, y_test))
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
import argparse

# Command line: -njobs <int> is mandatory and is passed to TPOT as n_jobs.
parser = argparse.ArgumentParser()
parser.add_argument('-njobs', dest='njobs', type=int, required=True)
args = parser.parse_args()

digits = load_digits()

# One seed is shared by the split and the search.
random_seed = 0
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    train_size=0.75,
    test_size=0.25,
    random_state=random_seed,
)

tpot = TPOTClassifier(
    generations=5,
    population_size=40,
    cv=5,
    n_jobs=args.njobs,
    random_state=random_seed,
    verbosity=2,
    use_dask=False,
)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
# Fragment: for target_label == "linear_label" it fits a TPOTRegressor;
# otherwise it picks a custom scoring name ("macroF1MinusGreen" for "label",
# "macroF1FromGranular" otherwise, overridden by "macroF1MinusCrisis" when
# tryCrisisFocus is truthy) and fits a TPOTClassifier with memory='auto' and
# periodic checkpoints.  Both estimators share population size, random state,
# cv, n_jobs, time limits and config_dict.
# NOTE(review): with the line breaks lost it is unclear whether tpot.export
# runs for both branches or only the classifier branch -- confirm.
if target_label == "linear_label": print("Running regression") tpot = TPOTRegressor(population_size=population_size, verbosity=2, random_state=random_state, cv=kf, n_jobs=n_jobs, max_time_mins=max_time_mins, max_eval_time_mins=max_eval_time_mins, config_dict=config_dict) tpot.fit(X_train, y_train) else: #label or granular label score_function = "macroF1MinusGreen" if target_label == "label" else "macroF1FromGranular" if tryCrisisFocus: score_function = "macroF1MinusCrisis" tpot = TPOTClassifier(population_size=population_size, verbosity=2, scoring=score_function, random_state=random_state, cv=kf, n_jobs=n_jobs, max_time_mins=max_time_mins, max_eval_time_mins=max_eval_time_mins, config_dict=config_dict, memory='auto', periodic_checkpoint_folder=checkpoint_folder) tpot.fit(X_train, y_train) tpot.export(full_tpot_out_filename)
def test_get_by_name():
    """Assert that the Operator class returns operators by name appropriately"""
    tpot_obj = TPOTClassifier()
    # Look the operator up by name, then compare the ``__class__`` of the
    # result with the ``__class__`` of the expected operator.
    found_operator = get_by_name("SelectKBest", tpot_obj.operators)
    found_class = found_operator.__class__
    expected_class = TPOTSelectKBest.__class__
    assert found_class == expected_class