def test_score():
    """Assert that the TPOT score function raises a ValueError when no optimized pipeline exists"""
    tpot_obj = TPOTClassifier()

    try:
        tpot_obj.score(testing_features, testing_classes)
        assert False  # Should be unreachable
    except ValueError:
        pass
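# Side note (not from the original test suite): if pytest is the runner, the
# same check can be written with pytest.raises; testing_features and
# testing_classes are assumed to be the module-level fixtures used above.
import pytest

def test_score_raises():
    """Equivalent assertion using pytest.raises instead of try/except."""
    tpot_obj = TPOTClassifier()
    with pytest.raises(ValueError):
        tpot_obj.score(testing_features, testing_classes)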
def test_score_2():
    """Assert that the TPOTClassifier score function outputs a known score for a fixed pipeline"""
    tpot_obj = TPOTClassifier()
    known_score = 0.977777777778  # Assumes use of the TPOT balanced_accuracy function

    # Reify pipeline with known score
    pipeline_string = ('KNeighborsClassifier(input_matrix, KNeighborsClassifier__n_neighbors=10, '
                       'KNeighborsClassifier__p=1, KNeighborsClassifier__weights=uniform)')
    tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    # Get score from TPOT
    score = tpot_obj.score(testing_features, testing_classes)

    # http://stackoverflow.com/questions/5595425/
    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)

    assert isclose(known_score, score)
def test_score_2():
    """Assert that the TPOTClassifier score function outputs a known score for a fixed pipeline"""
    tpot_obj = TPOTClassifier()
    tpot_obj._pbar = tqdm(total=1, disable=True)
    known_score = 0.986318199045  # Assumes use of the TPOT balanced_accuracy function

    # Reify pipeline with known score
    tpot_obj._optimized_pipeline = creator.Individual.from_string(
        'RandomForestClassifier(input_matrix)', tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features, training_classes)

    # Get score from TPOT
    score = tpot_obj.score(testing_features, testing_classes)

    # http://stackoverflow.com/questions/5595425/
    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)

    assert isclose(known_score, score)
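# Side note: both tests above re-implement the Stack Overflow isclose recipe.
# On Python 3.5+ the standard library ships the same function with the same
# signature, so the local helper can be dropped:
from math import isclose

assert isclose(known_score, score, rel_tol=1e-09, abs_tol=0.0)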
def tpot(self):
    from tpot import TPOTClassifier

    tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
    tpot.fit(self.train_data, self.train_label)
    print(tpot.score(self.predi_data, self.predi_label))
train.Cabin = train.Cabin[train.Cabin != 'T']

# Dropping unused columns
train = train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'], axis='columns')
test = test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'], axis='columns')
print('train_shape={},test_shape={}'.format(train.shape, test.shape))

# Doing a train/validation split
y = train.pop('Survived')
X = train
train_X, validation_X, train_y, validation_y = train_test_split(X, y, test_size=0.3, random_state=42)

# Fitting a TPOT classification model
# Change max_time_mins for the amount of time you want to train
tpot = TPOTClassifier(verbosity=2, max_time_mins=1)
tpot.fit(train_X, train_y)
print(tpot.score(validation_X, validation_y))
print(tpot.fitted_pipeline_)
tpot.export('tpot_titanic.py')
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier
import xgboost as xgb

dfee0 = pd.read_csv("dfee0new.csv")
df = dfee0
dfee0['status'] = dfee0['status'].astype(int)

target = 'status'
IDcol = 'uid'
predictors = [x for x in df.columns if x not in [target, IDcol]]

X_train, X_test, y_train, y_test = train_test_split(
    dfee0[predictors], dfee0[target], train_size=0.75, test_size=0.25)

tpot = TPOTClassifier(generations=100, population_size=100, verbosity=2, n_jobs=8)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_report_pipeline.py')
# get train and test data
X_train, X_test, y_train, y_test = train_test_split(alldata, labels, train_size=0.750, test_size=0.250)

if mtype in ['classification', 'c']:
    tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, n_jobs=-1)
    tpotname = '%s_tpotclassifier.py' % (jsonfile[0:-5])
elif mtype in ['regression', 'r']:
    tpot = TPOTRegressor(generations=5, population_size=20, verbosity=2)
    tpotname = '%s_tpotregression.py' % (jsonfile[0:-5])

tpot.fit(X_train, y_train)
accuracy = tpot.score(X_test, y_test)
tpot.export(tpotname)

# export data to .json format
data = {
    'data': alldata.tolist(),
    'labels': labels.tolist(),
}

jsonfilename = '%s_.json' % (tpotname[0:-3])
jsonfile = open(jsonfilename, 'w')
json.dump(data, jsonfile)
jsonfile.close()

# now edit the file and run it
g = open(tpotname).read()
y_train = train['response']
y_test = test['response']
X_train = train.drop('response', axis=1).copy()
X_test = test.drop('response', axis=1).copy()

tpot = TPOTClassifier(verbosity=3,
                      scoring="roc_auc",
                      random_state=23,
                      n_jobs=-1,
                      generations=5,
                      population_size=10)

times = []
scores = []
winning_pipes = []

# run three iterations and time them
for x in range(3):
    start_time = timeit.default_timer()
    tpot.fit(X_train, y_train)
    elapsed = timeit.default_timer() - start_time
    times.append(elapsed)
    winning_pipes.append(tpot.fitted_pipeline_)
    scores.append(tpot.score(X_test, y_test))
    tpot.export('tpot_h2odata_pipeline.py')

times = [time / 60 for time in times]
print('Times:', times)
print('Scores:', scores)
print('Winning pipelines:', winning_pipes)
def cyc():
    n = 5
    s = 0
    cl = "class"
    ans = ""
    df = pd.DataFrame(pd.read_csv('colData.csv'))
    # Column names below match the CSV headers (including the 'nuetrons' spelling)
    varArr = ["protons", "nuetrons", "electrons", "ProtElectConfigNumb",
              "NueConfigNumb", "periodic x", "periodic y"]
    for i in range(s, 7):
        ans += '[' + str(i) + '],'
        print('col' + str(i) + '.csv')
        print("data framed")
        if i != 7:
            df.drop([varArr[i]], axis=1, inplace=True)
            df.drop(['element'], axis=1, inplace=True)
            df.drop(['ProtElectConfig'], axis=1, inplace=True)
            df.drop(['NueConfig'], axis=1, inplace=True)
        if i == 7:
            df.drop(['ProtElectConfigNumb'], axis=1, inplace=True)
            df.drop(['NueConfigNumb'], axis=1, inplace=True)
        if i == 8:
            df.drop(['magicNue'], axis=1, inplace=True)
            df.drop(['magicPro'], axis=1, inplace=True)
        # itertools.combinations(iterable, r)?
        # df.drop(['half'], axis=1, inplace=True)
        # df.drop(['magicNue'], axis=1, inplace=True)
        # df.drop(['magicPro'], axis=1, inplace=True)
        # df.replace(NaN)
        # print(df)
        X = np.array(df.drop([cl], axis=1))
        y = np.array(df[cl])
        print("here")
        # print(csvInate)
        #X =[[1,0,1,1,1,1,1,1],[2,2,2,2,2,2,18,1],[3,4,3,11,12,11,1,2],[4,5,4,12,121,12,2,2],[5,6,5,121,122,121,13,2],[6,6,6,222,222,222,14,2],[7,7,7,223,223,223,15,2],[8,8,8,224,224,224,16,2],[9,10,9,225,226,225,17,2],[37,48,37,41,4210,41,1,5],[115,174,115,6214103,6214103218141060,6214103,15,7],[99,153,99,6211,621410321811,6211,14,7],[73,108,73,62143,62146,62143,5,6],[25,30,25,325,3210,325,7,4],[117,175,117,6214105,6214103218141060,6214105,17,7],[86,136,86,6214106,6214103212,6214106,18,6],[36,48,36,42106,4210,42106,18,3],[30,35,30,4210,42105,4210,12,4],[112,173,112,621410,6214103218141060,621410,12,7],[110,171,110,62148,62141031814105,62148,10,7],[6,6,6,122,122,122,14,2],[14,14,14,222,222,222,14,3],[83,126,83,214103,52146,214103,15,6],[75,111,75,52145,62149,52145,7,6],[11,22,11,21,322,21,1,3],[118,176,118,6214106,6214103218141060,6214106,18,7],[24,28,24,324,328,324,6,4],[48,66,48,5210,5210,5210,12,5],[1,0,1,1,1,1,1,1],[2,2,2,2,2,2,18,1],[3,4,3,11,12,11,1,2],[4,5,4,12,121,12,2,2],[5,6,5,121,122,121,13,2],[6,6,6,222,222,222,14,2],[7,7,7,223,223,223,15,2],[8,8,8,224,224,224,16,2],[9,10,9,225,226,225,17,2],[37,48,37,41,4210,41,1,5],[115,174,115,6214103,6214103218141060,6214103,15,7],[99,153,99,6211,621410321811,6211,14,7],[73,108,73,62143,62146,62143,5,6],[25,30,25,325,3210,325,7,4],[117,175,117,6214105,6214103218141060,6214105,17,7],[86,136,86,6214106,6214103212,6214106,18,6],[36,48,36,42106,4210,42106,18,3],[30,35,30,4210,42105,4210,12,4],[112,173,112,621410,6214103218141060,621410,12,7],[110,171,110,62148,62141031814105,62148,10,7],[6,6,6,122,122,122,14,2],[14,14,14,222,222,222,14,3],[83,126,83,214103,52146,214103,15,6],[75,111,75,52145,62149,52145,7,6],[11,22,11,21,322,21,1,3],[118,176,118,6214106,6214103218141060,6214106,18,7],[24,28,24,324,328,324,6,4],[48,66,48,5210,5210,5210,12,5]]
        # sklearn.cross_validation was removed; train_test_split now lives in
        # sklearn.model_selection
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            X, y, test_size=0.3)
        # clf = neighbors.KNeighborsClassifier()
        # clf = ensemble.RandomForestClassifier()
        clf = TPOTClassifier(generations=2, population_size=100, verbosity=2,
                             scoring="accuracy")
        # clf = ElasticNet()
        # clf = MLPClassifier()
        clf.fit(X_train, y_train)
        accuracy = clf.score(X_test, y_test)
        print("acc" + str(i))
        print(accuracy)
        # clf.export('col2' + str(i) + '.py')
        print("EXPORTED")
        ans += str(accuracy) + ','
    print(ans)
# Date: 2020-03-01
# Use the TPOT automated machine learning tool for classification
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from action import *
import numpy as np

# Load the data
train_data, test_data = load_data()
train_data, test_data = data_fillna(train_data, test_data)
train_data, test_data = data_process(train_data, test_data)

features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'My']
train_data['My'] = train_data['Age'] + train_data['Sex']
test_data['My'] = test_data['Age'] + test_data['Sex']
train_labels = train_data['Survived']
train_features = train_data[features]

# train_test_split returns both feature splits first, then both label splits
train_x, test_x, train_y, test_y = train_test_split(
    train_features, train_labels, test_size=0.3, random_state=1)
test_features = test_data[features]

tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpot.fit(train_x, train_y)
print(tpot.score(test_x, test_y))
tpot.export('tpot_mnist_pipeline.py')

# output
# Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.7000000000000001, min_samples_leaf=6, min_samples_split=9, n_estimators=100)
# 0.7761194029850746
qb = df[(df.Position == 'QB') & (df.AvgPts >= 12)]
rb = df[(df.Position == 'RB') & (df.AvgPts >= 8)]
wr = df[(df.Position == 'WR') & (df.AvgPts >= 8)]
te = df[(df.Position == 'TE') & (df.AvgPts >= 6)]

# need to remove nulls; x3 itself is never null, so append it, drop the null
# rows, then split it back off as the target
X = qb[qb_features + ['x3']]
X = X[~X.isnull().any(axis=1)]
y = X['x3']
X = X.drop('x3', axis=1)  # drop returns a copy, so the result must be assigned

print('start qb model\n\n')
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpot.fit(X_train, y_train)
print('qb model: {}'.format(tpot.score(X_test, y_test)))
print('\n\n')

X = rb[rb_features]
y = rb['x3']
print('start rb model\n\n')
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpot.fit(X_train, y_train)
print('rb model: {}'.format(tpot.score(X_test, y_test)))

X = wr[wr_features]
y = wr['x3']
print('start wr model\n\n')
# Note: After instantiating the Client you can open
# http://localhost:8787/status to see the dashboard of workers.
# To see the dashboard, bokeh needs to be installed in your environment.
from sklearn.externals import joblib
import distributed.joblib
from dask.distributed import Client

client = Client(diagnostics_port=8788, processes=False)
client

# Create data
digits = load_digits()

# To ensure the example runs quickly, we'll make the training dataset
# relatively small.
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,
                                                    train_size=0.75, test_size=0.25)

# Using Dask
# scale up: increase the TPOT parameters like population_size, generations.
# Note: when use_dask=True, TPOT will use as many cores as are available on
# the cluster, regardless of the n_jobs specified.
tp = TPOTClassifier(generations=5, population_size=40, cv=5,
                    random_state=0, verbosity=2, use_dask=True)

with joblib.parallel_backend('dask'):
    tp.fit(X_train, y_train)

print(tp.score(X_test, y_test))
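# Side note (an assumption about newer library versions, not from the original
# snippet): sklearn.externals.joblib and distributed.joblib were removed in
# later releases, and with use_dask=True TPOT no longer needs an explicit
# joblib backend. A minimal modern sketch of the same setup:
from dask.distributed import Client
from tpot import TPOTClassifier

client = Client(processes=False)  # dashboard address is shown in the client repr

tp = TPOTClassifier(generations=5, population_size=40, cv=5,
                    random_state=0, verbosity=2, use_dask=True)
tp.fit(X_train, y_train)  # pipeline evaluations are scheduled on the Dask cluster
print(tp.score(X_test, y_test))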
pipeline_optimizer = TPOTClassifier(generations=100, warm_start=True, verbosity=2,
                                    max_time_mins=60, early_stop=5)
# f = open("Test_scores.txt", "a+")

# Initialization
X = B[0].iloc[:, 0:-1]
y = B[0].iloc[:, -1]
start = time.time()
pipeline_optimizer.fit(X, y)

for i in range(1, n):
    X = B[i].iloc[:, 0:-1]
    y = B[i].iloc[:, -1]
    accuracy = pipeline_optimizer.score(X, y)
    end = time.time()
    # f.write("Test batch %d - Test score %f - Duration %f\n" % (i, accuracy, end - start))
    print("Test batch %d - Test score %f - Duration %f\n" % (i, accuracy, end - start))
    # file = 'tpot_exported_pipelinefor' + str(i) + '.py'
    # pipeline_optimizer.export(file)
    start = time.time()
    pipeline_optimizer.fit(X, y)

# f.close()  # only needed if the file handle above is actually opened

# In[ ]:
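# Side note: the loop above relies on warm_start=True, which makes each
# successive fit() call resume evolution from the population kept by the
# previous call instead of starting from scratch. A minimal sketch of that
# behavior in isolation (synthetic data, illustrative only):
from sklearn.datasets import make_classification
from tpot import TPOTClassifier

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
opt = TPOTClassifier(generations=2, population_size=10, warm_start=True, verbosity=0)
opt.fit(X_demo, y_demo)  # first call: evolves a fresh population
opt.fit(X_demo, y_demo)  # second call: continues from the retained population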
import numpy as np
import pandas as pd
from tpot import TPOTClassifier
from sklearn import preprocessing, model_selection, svm, neighbors, linear_model, \
    discriminant_analysis, naive_bayes, tree

df = pd.read_csv('forestfires.csv')
df = df.replace({'month': {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
                           'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12},
                 'day': {'sun': 1, 'mon': 2, 'tue': 3, 'wed': 4, 'thu': 5, 'fri': 6, 'sat': 7}})

X = np.array(df.drop(['area', 'month', 'day', 'X', 'Y'], axis=1))
X = preprocessing.scale(X)
y = np.array(df['area'])
y = np.heaviside(y, 0)  # binarize burned area: 0 stays 0, any positive area becomes 1

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)

pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
                                    random_state=42, verbosity=2)
pipeline_optimizer.fit(X_train, y_train)
accuracy = pipeline_optimizer.score(X_test, y_test)
print('tpot:', accuracy)
y_train, uniques = pd.factorize(y_train, sort=False)
starttime = datetime.now()

# Configure
tpotConf = f"# TPOTClassifier(verbosity=2,generations={generations},config_dict={config_dict},max_time_mins={max_time_mins},random_state={random_state},early_stop={early_stop})"
print(tpotConf)
Tpot = TPOTClassifier(verbosity=2, generations=generations, config_dict=config_dict,
                      max_time_mins=max_time_mins, random_state=random_state,
                      early_stop=early_stop)
Tpot_file = ifile.replace(".csv", "_" + str(generations) + "g_" + model + "_Tpot.py")
print(f"Fitting to file {Tpot_file}")

# Start
Tpot.fit(X_train, y_train)
Tpot_score = Tpot.score(X_test, y_test)
Tpot.export(Tpot_file)
print(f"Score: see {Tpot_file}")  # the correct score is in the TPOT export file

endtime = datetime.now()
duration = mytreat.dur(starttime, endtime)
mycomments = tpotConf
mycomments = mycomments + f'# ifile {ifile} model={argv.model} Tpot_file={Tpot_file}, Tpot_score={Tpot_score}, starttime{starttime}, endtime={endtime} duration={duration}\n'
myutils = mmutils.mmutils()
myutils.prependComments(Tpot_file, mycomments)
mlb = MultiLabelBinarizer()
job_Trans = mlb.fit_transform([{str(val)} for val in data['job'].values])
education_Trans = mlb.fit_transform([{str(val)} for val in data['education'].values])
month_Trans = mlb.fit_transform([{str(val)} for val in data['month'].values])

data_new = data.drop(['marital', 'default', 'housing', 'loan', 'contact',
                      'poutcome', 'class', 'job', 'education', 'month'], axis=1)
data_new = np.hstack((data_new.values, job_Trans, education_Trans, month_Trans))
data_class = data['class'].values

training_indices, validation_indices = train_test_split(
    data.index, stratify=data_class, train_size=0.75, test_size=0.25)

tpot = TPOTClassifier(population_size=15, max_eval_time_mins=0.04,
                      max_time_mins=2, verbosity=3, n_jobs=-1)
tpot.fit(data_new[training_indices], data_class[training_indices])
score = tpot.score(data_new[validation_indices],
                   data.loc[validation_indices, 'class'].values)
print(score)
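# Side note: because every row contributes exactly one value per column, the
# three MultiLabelBinarizer passes above amount to plain one-hot encoding.
# A simpler sketch of the same transformation with pandas (same column names
# assumed; dummy column ordering may differ from the binarizer output):
import numpy as np
import pandas as pd

kept = data.drop(['marital', 'default', 'housing', 'loan', 'contact',
                  'poutcome', 'class', 'job', 'education', 'month'], axis=1)
dummies = pd.get_dummies(data[['job', 'education', 'month']].astype(str))
data_new_alt = np.hstack((kept.values, dummies.values))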
merged = merged.iloc[:, 2:]
yy = np.array(ydata)
datac = np.mat(merged)
# datac = np.concatenate((merged, yy), axis=1)
random.shuffle(datac)

# note: the 1201: slice skips row 1200 entirely
xc_t = datac[:, :-1][:1200, ]
xc_v = datac[:, :-1][1201:, ]
yt = [x[0] for x in datac[:, -1][:1200, ].astype(np.int32).tolist()]
yv = [x[0] for x in datac[:, -1][1201:, ].astype(np.int32).tolist()]

pipeline_optimizer.fit(xc_t, yt)
print(pipeline_optimizer.score(xc_v, yv))
pipeline_optimizer.export('./tpot_exported_pipeline.py')

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

exported_pipeline = make_pipeline(
    make_union(FunctionTransformer(copy), FunctionTransformer(copy)),
    GradientBoostingClassifier(learning_rate=0.5, max_depth=6,
                               max_features=0.9000000000000001,
# data, label = data[idx_row, :], label[idx_row]
# features = data
# tpot_data = pd.DataFrame({'class': label}, columns=['class'])
# training_features, testing_features, training_classes, testing_classes = \
#     train_test_split(features, tpot_data['class'], random_state=42)

data, label, idx_row = np.concatenate(samples), np.concatenate(label), np.arange(0, len(label), 1)
print('shuffle')
for ii in range(100):
    shuffle(idx_row)
data, label = data[idx_row, :], label[idx_row]

X_train, X_test, y_train, y_test = train_test_split(data, label, train_size=0.80)
print('model selection')
tpot = TPOTClassifier(generations=10, population_size=25, verbosity=2,
                      random_state=373849, num_cv_folds=5,  # num_cv_folds was renamed to cv in later TPOT versions
                      scoring='roc_auc')
tpot.fit(X_train, y_train)
tpot.score(X_test, y_test)
tpot.export('%s%s_tpot_exported_pipeline.py' % (folder, type_))
print('finished model selection')

"""
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectFwe, f_classif
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import KFold

exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(lambda X: X),
        FunctionTransformer(lambda X: X)
    ),
# if y_test_string[i] == 'train':
#     y_test[i] = 5
# if y_test_string[i] == 'zebra':
#     y_test[i] = 6

""" -------------- TPOT does its magic ------------------------------------- """
from tpot import TPOTClassifier

clf = TPOTClassifier(verbosity=2, n_jobs=1)
clf.fit(X_train, y_train)
print('test score=', clf.score(X_test, y_test))

predictions = clf.predict(X_test)
print(confusion_matrix(y_test, predictions))

os.chdir(
    'C:/Users/Bruger/Documents/Uni/Advanche machine learning/Projekt/Code/thor_final_scripts_for_report'
)
clf.export('TPOT_EXPORT_autoML_newData_CHANNELSREMOVED_95PCA_Binary.py')

# binomial standard error of the score
error_rate = clf.score(X_test, y_test)
number_observations = len(X_test)
print('uncertainty=', np.sqrt((error_rate * (1 - error_rate)) / number_observations))

# def f(error_rate, number_observations):
#     return np.sqrt((error_rate * (1 - error_rate)) / number_observations)
def main():
    # set up the path to the data sets and the data we are going to experiment with
    base_path = '/scratch/ditzler/Git/ClassificationDatasets/csv/'
    data_setz = [#'bank',
        'blood',
        'breast-cancer-wisc-diag',
        'breast-cancer-wisc-prog',
        'breast-cancer-wisc',
        'breast-cancer',
        'congressional-voting',
        'conn-bench-sonar-mines-rocks',
        'credit-approval',
        'cylinder-bands',
        'echocardiogram',
        #'fertility',
        'haberman-survival',
        'heart-hungarian',
        'hepatitis',
        'ionosphere',
        'mammographic',
        'molec-biol-promoter',
        'musk-1',
        'oocytes_merluccius_nucleus_4d',
        'oocytes_trisopterus_nucleus_2f',
        'ozone',
        'parkinsons',
        'pima',
        #'pittsburg-bridges-T-OR-D',
        'planning',
        'ringnorm',
        #'spambase',
        'spectf_train',
        'statlog-australian-credit',
        'statlog-german-credit',
        'statlog-heart',
        'titanic',
        #'twonorm',
        'vertebral-column-2clases']

    # n_splitz is like the number of cv folds (bootstraps here); then set up
    # some variables to save the results to.
    n_splitz = 10
    errors = np.zeros((len(data_setz),))
    fms = np.zeros((len(data_setz),))
    times = np.zeros((len(data_setz),))
    m = 0

    for n in range(n_splitz):
        print('Split ' + str(n) + ' of ' + str(n_splitz))
        for i in range(len(data_setz)):
            print('  ' + data_setz[i])
            df = pd.read_csv(base_path + data_setz[i] + '.csv', sep=',')
            data = df.values  # DataFrame.as_matrix() was removed in newer pandas
            X = data[:, :-1]
            y = data[:, -1]
            X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                                train_size=0.75,
                                                                test_size=0.25,
                                                                random_state=m)
            m += 1
            ts = time.time()
            tpot = TPOTClassifier(generations=10, population_size=25, verbosity=1)
            tpot.fit(X_train, y_train)
            times[i] += (time.time() - ts)
            errors[i] += (1 - tpot.score(X_test, y_test))
            yhat = tpot.predict(X_test)
            fms[i] += f1_score(y_test, yhat, average='macro')

    errors /= n_splitz
    fms /= n_splitz
    times /= n_splitz

    df = pd.DataFrame({'errors': errors, 'fms': fms, 'times': times})
    df.to_csv(path_or_buf='tpot-results2.csv', sep=',')
    return None
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled.shape

clf = LogisticRegression()
svmclf = SVC(kernel='rbf')
rfclf = RandomForestClassifier()
tpotclf = TPOTClassifier()

model1 = clf.fit(X_train, y_train)
model2 = svmclf.fit(X_train, y_train)
model3 = rfclf.fit(X_train, y_train)
model_auto_clf = tpotclf.fit(X_train, y_train)  # fit requires both features and labels

score = cross_val_score(clf, X_train, y_train)
score2 = cross_val_score(svmclf, X_train, y_train)
score3 = cross_val_score(rfclf, X_train, y_train)
print("score is: %.2f" % score3.mean())  # cross_val_score returns an array of fold scores

## TPOT classifier
tpotclf = TPOTClassifier(generations=5, cv=5)
model_tpot_clf = tpotclf.fit(X_train, y_train)
score = tpotclf.score(X_test, y_test)
print(score)
tpotclf.export('classifier-pipeline.py')

# predict for test
y_pred = clf.predict(X_test)
y1_pred = svmclf.predict(X_test)
y2_pred = rfclf.predict(X_test)
score_1 = accuracy_score(y_test, y_pred)
score_2 = accuracy_score(y_test, y1_pred)
score_3 = accuracy_score(y_test, y2_pred)
print("score is:", score_1, score_2, score_3)
# Run the auto-sklearn framework on the dataset
auto_score = -1
try:
    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=timeout_in_sec, n_jobs=n_jobs)
    automl.fit(X_train, y_train, feat_type=feat_type)
    # automl.fit_ensemble(y_train, ensemble_size=50)
    y_hat = automl.predict(X_test.values)
    auto_score = accuracy_score(y_test, y_hat)
except Exception:
    print(sys.exc_info()[0])

# Run the TPOT framework on the dataset
tpot_score = -1
try:
    tpot = TPOTClassifier(verbosity=0, n_jobs=n_jobs, random_state=1,
                          max_time_mins=timeout, max_eval_time_mins=0.04,
                          population_size=15)
    tpot.fit(X_train, y_train)
    tpot_score = tpot.score(X_test, y_test)
except Exception:
    print(sys.exc_info()[0])

# Run the Lite-AutoML framework on the dataset
(best, atts, cl) = liteautoml.compute_score(X_train, y_train, X_test, y_test,
                                            cat_indicator, n_jobs, timeout_in_sec)

# Run hyperopt-sklearn on the dataset
hp_best = evaluate_hyperopt.compute_score(X_train, y_train, X_test, y_test,
                                          cat_indicator, n_jobs, timeout_in_sec)

# Write results and dataset details to file
outfile.write(dataset.name + "," + str(dummy) + "," + cl + "," + str(id) + "," +
              str(rows) + "," + str(classes) + "," + str(auto_score) + "," +
              str(tpot_score) + "," + str(hp_best) + "," + str(best) + "," +
              str(len(X.columns)) + "," + str(atts) + '\n')
outfile.close()
from tpot import TPOTClassifier
from tools import prepare_dataset

y, x = prepare_dataset()
x_train = x[:614]
y_train = y[:614].reshape(-1, )
x_valid = x[614:]
y_valid = y[614:].reshape(-1, )

pipeline_optimizer = TPOTClassifier(generations=50, population_size=20, cv=5,
                                    random_state=42, verbosity=2)
pipeline_optimizer.fit(x_train, y_train)
# note: this scores on the training split; x_valid/y_valid are left unused
print(pipeline_optimizer.score(x_train, y_train))
pipeline_optimizer.export('tpot_exported_pipeline.py')
# Train the decision tree
clf.fit(train_features, train_labels)
test_features = dvec.transform(test_features.to_dict(orient='record'))

# Decision tree prediction
pred_labels = clf.predict(test_features)

# Get the decision tree accuracy.
# Computing accuracy on the training set is not quite sound (overfitting should
# be taken into account), but we are just walking through the workflow here,
# so it does not matter much.
acc_decision_tree = round(clf.score(train_features, train_labels), 6)
print("ID3 score (accuracy): %.4lf%%" % (acc_decision_tree * 100))

cls = DecisionTreeClassifier()
cls.fit(train_features, train_labels)
pred_labels = cls.predict(test_features)
acc_cart_tree = round(cls.score(train_features, train_labels), 6)
print("CART score (accuracy): %.4lf%%" % (acc_cart_tree * 100))

xg = XGBClassifier()
xg.fit(train_features, train_labels)
pred_labels = xg.predict(test_features)
acc_xgboost = round(xg.score(train_features, train_labels), 6)
print("XGBoost score (accuracy): %.4lf%%" % (acc_xgboost * 100))

tpotcls = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpotcls.fit(train_features, train_labels)
pred_labels = tpotcls.predict(test_features)
acc_tpot = round(tpotcls.score(train_features, train_labels), 6)
print("TPOT score (accuracy): %.4lf%%" % (acc_tpot * 100))
tpotcls.export('tpot_titanic_pipeline.py')
X = df.drop('event', axis=1)
y = df.event

# Encode y: map the sorted unique labels to integer category codes
np.sort(y.unique())
y = y.astype('category').cat.codes

# %%
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=42)

# %%
# PCA
# Scale data first
scale = StandardScaler()
X_train_scaled = scale.fit_transform(X_train)
X_test_scaled = scale.transform(X_test)

pca = PCA(n_components=30)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# %%
tpot = TPOTClassifier(verbosity=2, random_state=42)
tpot.fit(X_train_pca, y_train)
print(tpot.score(X_test_pca, y_test))
tpot.export('tpot_project_pipeline.py')
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split as split

digits = load_digits()
X_train, X_test, y_train, y_test = split(digits.data, digits.target)

pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5)
pipeline_optimizer.fit(X_train, y_train)
print(pipeline_optimizer.score(X_test, y_test))
pipeline_optimizer.export('../models/tpot_exported_pipeline.py')
def genetic_algorithm(X_train, X_test, y_train, y_test):
    from tpot import TPOTClassifier

    tpot = TPOTClassifier(generations=100, population_size=20, verbosity=2)
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))
# Numpy for converting the column vector into an array
import numpy as np
# used in a Jupyter notebook for getting the data from the directory
import os

# reading the data in csv format
train = pd.read_csv(r"california_housing_train.csv")
test = pd.read_csv(r"california_housing_test.csv")

# Splitting the dataset into X, Y train and test
Y_train = train[["median_house_value"]]
X_train = train.drop("median_house_value", axis=1, inplace=False)
Y_test = test[["median_house_value"]]
X_test = test.drop("median_house_value", axis=1, inplace=False)

# Creating an object of TPOTClassifier which runs the genetic algorithm for
# 5 generations and then stops
tpot = TPOTClassifier(generations=5, verbosity=2)

# Creating the model and printing the score
tpot.fit(X_train, np.ravel(Y_train))
print(tpot.score(X_test, np.ravel(Y_test)))
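# Side note (an observation, not from the original snippet): median_house_value
# is continuous, so TPOTClassifier treats every distinct price as its own class.
# A hedged sketch of the regression counterpart, reusing the frames above:
from tpot import TPOTRegressor

tpot_reg = TPOTRegressor(generations=5, verbosity=2)
tpot_reg.fit(X_train, np.ravel(Y_train))
print(tpot_reg.score(X_test, np.ravel(Y_test)))  # default scoring is negative MSE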
                     random_state=1776,
                     warm_start=True)
my_tpot.fit(x_train, y_train)

# DETERMINE BEST CV SCORE PIPELINE ----------------------------------------------
best_pipes = my_tpot.pareto_front_fitted_pipelines_
len_best_pipes = len(best_pipes)
best_pipe_key = list(best_pipes.keys())[(len_best_pipes - 1)]  # key is the entire pipeline as a string
best_cv = abs(my_tpot.evaluated_individuals_[best_pipe_key][1])

# HOLDOUT SCORE --------------------------------------------------------------
holdout_score = my_tpot.score(x_test, y_test)
print(holdout_score)

print(ite)                  # row_id
print(best_pipe_key)        # best_pipe
print(best_cv)              # best_cv
print(holdout_score)        # holdout_score
print(this_scoring_method)  # scoring_method
print(xt_nrows)             # xt_rows
print(xt_numb_feats)        # xt_numb_feats

# replace commas in the best pipeline with dashes (this is the only field with
# a risk of containing commas)
best_pipe_key_no_comma = best_pipe_key.replace(",", "-")

# generate the content line regardless of whether the file exists already or not
content_line = str.format("{0}, {1}, {2}, {3}, {4}, {5}, {6}\n", ite,
import numpy as np

# Load the data
telescope = pd.read_csv('../data/magic04.data.csv')

# Clean the data
telescope_shuffle = telescope.iloc[np.random.permutation(len(telescope))]
tele = telescope_shuffle.reset_index(drop=True)

# Store 2 classes
tele['Class'] = tele['Class'].map({'g': 0, 'h': 1})
tele_class = tele['Class'].values

# Split training, testing, and validation data
training_indices, validation_indices = train_test_split(
    tele.index, stratify=tele_class, train_size=0.75, test_size=0.25)

# Let genetic programming find the best ML model and hyperparameters
tpot = TPOTClassifier(generations=5, verbosity=2)
tpot.fit(
    tele.drop('Class', axis=1).loc[training_indices].values,
    tele.loc[training_indices, 'Class'].values)

# Score the accuracy
tpot.score(
    tele.drop('Class', axis=1).loc[validation_indices].values,
    tele.loc[validation_indices, 'Class'].values)

# Export the generated code
tpot.export('pipeline.py')  # the filename must be passed as a string
print(sum(y_test == 1))
print(len(y_test))
print('percentage of not animals=', (sum(y_test == 1) - len(y_test)) / len(y_test))

from tpot import TPOTClassifier

clf = TPOTClassifier(verbosity=2, n_jobs=-1)
clf.fit(X_train, y_train)

print(sum(y_test == 1))
print(len(y_test))
print('percentage of not animals=', (sum(y_test == 1) - len(y_test)) / len(y_test))
print('test score=', clf.score(X_test, y_test))

predictions = clf.predict(X_test)
print(confusion_matrix(y_test, predictions))

# digits = load_digits()
# X = digits['data']
# y = digits['target']
#
# X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
#
# from sklearn.linear_model import LogisticRegression
# clf = LogisticRegression()
# clf.fit(X_train, y_train)
#
# result = clf.score(X_test, y_test)
data = pd.read_excel(path, header=1, index_col=0)
data = data.rename(columns={'default payment next month': "default"})
print2(data.head())

X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :-1], data.iloc[:, -1],
                                                    stratify=data.iloc[:, -1],
                                                    test_size=0.3)
print2(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Assign the values outlined to the inputs
number_generations = 3
population_size = 5
offspring_size = 10
scoring_function = "accuracy"

# Create the TPOT classifier
tpot_clf = TPOTClassifier(generations=number_generations,
                          population_size=population_size,
                          offspring_size=offspring_size,
                          scoring=scoring_function,
                          verbosity=2, random_state=2, cv=2)

# Fit the classifier to the training data
tpot_clf.fit(X_train, y_train)

# Score on the test set
print(tpot_clf.score(X_test, y_test))
    'sklearn.feature_selection.SelectFromModel': {
        'threshold': np.arange(0, 1.01, 0.05),
        'estimator': {
            'sklearn.ensemble.ExtraTreesClassifier': {
                'n_estimators': [100],
                'criterion': ['gini', 'entropy'],
                'max_features': np.arange(0.05, 1.01, 0.05)
            }
        }
    }
}

# generations: number of iterations of offspring creation
# population_size: initial number of individuals to create (10 here)
# offspring_size: number of individuals to create in each generation
# crossover_rate: fraction of individuals used to create offspring
# mutation_rate: probability that an attribute value is randomly changed
# All of this is based on a genetic algorithm
tpot = TPOTClassifier(generations=1, population_size=10, verbosity=2,
                      config_dict=tpot_config)
tpot.fit(X_train, y_train)
tpot.score(X_test, y_test)
tpot.export('/Users/sheng/PycharmProjects/untitled/guowei/chishi.py')
# tpot.score()
# tpot.export('result.py') exports standard scikit-learn code
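# Side note: for contrast with the nested selector config above, a minimal
# custom config_dict restricted to a single estimator looks like this (the
# parameter grid values are illustrative, not from the original):
from tpot import TPOTClassifier

minimal_config = {
    'sklearn.linear_model.LogisticRegression': {
        'C': [0.01, 0.1, 1.0, 10.0],
        'penalty': ['l2'],
    },
}

# TPOT will only search over pipelines built from the operators listed above
tpot_small = TPOTClassifier(generations=1, population_size=10,
                            verbosity=2, config_dict=minimal_config)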
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,
                                                    train_size=0.75, test_size=0.25)

tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_mnist_pipeline.py')
        Xtesttokeep = Xtest
        ytesttokeep = ytest
        Xtraintokeep = Xtrain
        ytraintokeep = ytrain
    acc[i - 1] = success_rate
    n_of_est[i - 1] = i * 20

print("\n\n BEST ACCURACY SIZE ACCORDING TO VALIDATOR ", est, "AND IDEAL MAX_DEPTH=", bestdepth)

clf = RandomForestClassifier(n_estimators=est, max_depth=bestdepth)
clf.fit(Xtraintokeep, ytraintokeep)
y_pred = clf.predict(Xtesttokeep)
ytesttokeep = ytesttokeep.values

# count correct predictions by hand
s = 0
for j in range(0, len(y_pred)):
    if ytesttokeep[j] == y_pred[j]:
        s = s + 1
success_rate2 = s / len(y_pred) * 100
print("\n\n ACCURACY (test data) WITH ", est, " estimators and max_depth=",
      bestdepth, " (chosen from validator) is ", success_rate2, "%")

# plt.plot(acc, n_of_est)
# plt.suptitle('No of estimators VS %accuracy (validator data)')
# plt.show()

tpot = TPOTClassifier(generations=5, population_size=10, verbosity=2)
tpot.fit(Xtraintokeep, ytraintokeep)
print(tpot.score(Xtesttokeep, ytesttokeep))