def main(args):
    """Train and evaluate a random forest when ``args.analyse`` is set.

    ``args.analyse`` is forwarded to ``process_data``; its four return
    values are used, in order, as training features, test features,
    training labels and test labels. A 100-tree ``RandomForestClassifier``
    is fitted on the training part and its ``score`` on the test part is
    printed.

    Args:
        args: Object exposing an ``analyse`` attribute.

    Returns:
        None. Nothing is done when ``args.analyse`` is ``None``.
    """
    # Identity test instead of ``!= None``: it cannot be hijacked by a
    # custom ``__ne__`` (e.g. array-like values comparing element-wise).
    if args.analyse is None:
        return

    train_data_x, test_data_x, train_data_y, test_data_y = process_data(args.analyse)

    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(train_data_x, train_data_y)
    # Call form of print: valid on Python 3, same output on Python 2.
    print(forest.score(test_data_x, test_data_y))
class Model(BaseModel):
    """Antares implementation of the scikit-learn random forest classifier."""

    def __init__(self, categorical_features=None, n_estimators=50, n_jobs=-1, max_depth=10):
        '''
        Build the wrapper and its underlying ``RandomForestClassifier``.

        Args:
            categorical_features: Forwarded unchanged to ``BaseModel.__init__``.
            n_estimators: Number of trees in the forest.
            n_jobs: Number of parallel jobs passed to scikit-learn.
            max_depth: Maximum depth of each tree.

        Example:
            >>> from madmex.modeling.supervised.rf import Model
            >>> rf = Model()
            >>> # Write model to db
            >>> rf.to_db(name='test_model', recipe='mexmad', training_set='no')
            >>> # Read model from db
            >>> rf2 = Model.from_db('test_model')
        '''
        super().__init__(categorical_features=categorical_features)
        self.model = RandomForestClassifier(n_estimators=n_estimators,
                                            n_jobs=n_jobs,
                                            max_depth=max_depth)
        self.model_name = 'rf'

    def fit(self, X, y):
        '''
        Fit the forest on ``X`` (after ``hot_encode_training``) and ``y``.
        '''
        X = self.hot_encode_training(X)
        self.model.fit(X, y)

    def predict(self, X):
        '''
        Simply passes down the prediction from the underlying model,
        after running ``X`` through ``hot_encode_predict``.
        '''
        X = self.hot_encode_predict(X)
        return self.model.predict(X)

    def predict_confidence(self, X):
        """Get the confidence of every prediction.

        The confidence is the largest class probability returned by
        ``predict_proba`` for each row.
        """
        X = self.hot_encode_predict(X)
        return self.model.predict_proba(X).max(axis=1)

    def score(self, X, y):
        '''
        Test the model given a dataset and a target vector.

        Applies the model to ``X`` and returns the value of the underlying
        estimator's ``score`` against the response variable ``y``. Usually
        the original dataset should be split into training and testing
        subsets to cross validate the model.
        '''
        # Encode exactly as predict() does: the forest was fitted on the
        # output of hot_encode_training, so raw X would not match it.
        # NOTE(review): presumably hot_encode_predict leaves X untouched when
        # no categorical features are configured - confirm in BaseModel.
        X = self.hot_encode_predict(X)
        return self.model.score(X, y)
def decision_frist():
    """Compare a small decision tree with a random forest on the iris data.

    The data set is split 75/25 into train and test parts. Each classifier
    is fitted on the training part, then its test predictions and its test
    score are printed (decision tree first, random forest second).
    """
    iris = datasets.load_iris()
    features, labels = iris["data"], iris["target"]
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25)

    # Factories rather than instances, so each estimator is only created
    # when its turn comes.
    factories = (
        lambda: DecisionTreeClassifier(max_leaf_nodes=3),
        lambda: RandomForestClassifier(),
    )
    for make_classifier in factories:
        classifier = make_classifier()
        classifier.fit(X_train, y_train)
        print(classifier.predict(X_test))
        print(classifier.score(X_test, y_test))
def getSentiment(self, path):
    """Train and score an SVM and a random forest on every CSV in ``path``.

    Each file is read without a header row. The last column is used as the
    label and all preceding columns as features. The first 70% of the rows
    form the training set and the remaining rows the test set. For every
    file the file name, the four split sizes and the test accuracy of
    ``svm.SVC`` and of a 1500-tree ``RandomForestClassifier`` are printed.

    Args:
        path: Directory whose entries are all read as CSV files.
    """
    for entry in os.listdir(path):
        # os.path.join also works when ``path`` has no trailing separator.
        filename = os.path.join(path, entry)
        print(filename)

        # to_numpy() replaces DataFrame.as_matrix(), removed in pandas 1.0.
        table = pd.read_csv(filename, header=None).to_numpy()
        data = table[:, :-1]
        score = table[:, -1]

        split = int(len(data) * 0.7)
        train_data, train_labels = data[:split], score[:split]
        # Start the test part at ``split``: starting at ``split + 1``
        # left the row at index ``split`` in neither set.
        test_data, test_labels = data[split:], score[split:]
        for part in (train_data, train_labels, test_data, test_labels):
            print(len(part))

        classifier_rbf = svm.SVC()
        classifier_rbf.fit(train_data, train_labels)
        print(classifier_rbf.score(test_data, test_labels))

        classifier_rmf = RandomForestClassifier(n_estimators=1500)
        classifier_rmf.fit(train_data, train_labels)
        print(classifier_rmf.score(test_data, test_labels))
def rfc(self):
    """Pick the forest size with the best cross-validated accuracy.

    For every candidate ``n_estimators`` value a fresh
    ``RandomForestClassifier`` is fitted on each fold produced by
    ``self.skf.split(self.X, self.Y)`` and its test-fold scores are summed.
    The candidate with the largest summed score is stored in
    ``self.bestScoreN`` (0 if no candidate scores above 0), and the mean
    score of each candidate is printed as a percentage.

    Side effects: ``self.X_train``, ``self.Y_train``, ``self.X_test`` and
    ``self.Y_test`` are overwritten with the data of the last fold seen.
    """
    candidates = [10, 50, 100, 1000, 10**4]
    self.bestScoreN = 0
    bestScore = 0
    for arg in candidates:
        score = 0
        n_folds = 0
        for train_ind, test_ind in self.skf.split(self.X, self.Y):
            self.X_train = [self.X[ind] for ind in train_ind]
            self.Y_train = [self.Y[ind] for ind in train_ind]
            self.X_test = [self.X[ind] for ind in test_ind]
            self.Y_test = [self.Y[ind] for ind in test_ind]
            clf = RandomForestClassifier(n_estimators=arg).fit(self.X_train, self.Y_train)
            score += clf.score(self.X_test, self.Y_test)
            n_folds += 1
        if score > bestScore:
            bestScore = score
            self.bestScoreN = arg
        # Average over the folds actually produced instead of assuming 5.
        mean_score = score / float(n_folds) if n_folds else 0.0
        print("Mean score for Paramter(", arg, ")", mean_score * 100, "%")
def models(type='logreg', X=None, y=None, Xtest=None, ytest=None):
    """Fit one classifier on ``(X, y)`` and return its score on the test data.

    Args:
        type: Which model to build: ``'logreg'`` (logistic regression),
            ``'cart'`` (decision tree), ``'rf'`` (random forest),
            ``'xgboost'`` (XGBoost classifier) or ``'nn'`` (multi-layer
            perceptron).
        X: Training features.
        y: Training labels.
        Xtest: Test features.
        ytest: Test labels.

    Returns:
        The value of the fitted estimator's ``score(Xtest, ytest)``, or
        ``None`` when ``type`` is not one of the names above.
    """
    if type == 'logreg':
        estimator = LogisticRegression(solver='lbfgs', max_iter=1000)
    elif type == 'cart':
        estimator = tree.DecisionTreeClassifier(random_state=100)
    elif type == 'rf':
        # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself was
        # deprecated in scikit-learn 1.1 and removed in 1.3.
        estimator = RandomForestClassifier(n_estimators=100,
                                           max_features='sqrt',
                                           random_state=100)
    elif type == 'xgboost':
        estimator = XGBClassifier(n_estimators=100, max_depth=4, random_state=100)
    elif type == 'nn':
        estimator = MLPClassifier(solver='adam',
                                  hidden_layer_sizes=(5, 5),
                                  max_iter=500,
                                  early_stopping=True,
                                  random_state=100)
    else:
        # Unknown model names keep yielding None, as before.
        return None

    estimator.fit(X, y)
    return estimator.score(Xtest, ytest)
class Model(BaseModel):
    '''
    Random forest classifier wrapper that can be saved to and restored
    from disk with joblib.
    '''

    def __init__(self, path):
        '''
        Remember ``path`` and create the (not yet fitted) forest with
        150 trees and 8 parallel jobs. ``BaseModel.__init__`` is not
        invoked here.
        '''
        self.model_name = 'rf'
        self.model = RandomForestClassifier(n_jobs=8, n_estimators=150)
        self.path = path

    def fit(self, X, y):
        '''
        Fit the underlying forest on features ``X`` and labels ``y``.
        '''
        self.model.fit(X, y)

    def predict(self, X):
        '''
        Return the underlying model's predictions for ``X``.
        '''
        predictions = self.model.predict(X)
        return predictions

    def save(self, filepath):
        '''
        Dump the model with joblib to the file name that
        ``create_filename`` builds from ``filepath`` and
        ``<model_name>.pkl``.
        '''
        target = create_filename(filepath, '%s.pkl' % self.model_name)
        joblib.dump(self.model, target)

    def load(self, filepath):
        '''
        Replace the current model with the one stored under the file name
        that ``create_filename`` builds from ``filepath`` and
        ``<model_name>.pkl``.
        '''
        source = create_filename(filepath, '%s.pkl' % self.model_name)
        self.model = joblib.load(source)

    def score(self, X, y):
        '''
        Return the underlying model's ``score`` for features ``X`` against
        the labels ``y``.
        '''
        result = self.model.score(X, y)
        return result
# Exploratory, REPL-style comparison of three classifiers on the iris data.
# Most expressions below are evaluated only for interactive display; their
# values are not stored.
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# --- k-nearest neighbours (k=3) -------------------------------------------
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(iris.data, iris.target)
knn.predict(iris.data)
len(iris.target)
# Number of training samples that are predicted correctly.
sum(iris.target == knn.predict(iris.data))
knn.score(iris.data, iris.target)

help(cross_val_predict)
cross_val_predict(knn, iris.data, iris.target, cv=20)
cross_val_score(knn, iris.data, iris.target, cv=20).mean()

# --- random forest with three trees ---------------------------------------
rf = RandomForestClassifier(n_estimators=3)
rf.fit(iris.data, iris.target)
rf.predict_proba(iris.data)
rf.score(iris.data, iris.target)
sum(iris.target == rf.predict(iris.data))
cross_val_score(rf, iris.data, iris.target, cv=20).mean()

# --- one-vs-rest logistic regression --------------------------------------
# The bare string below is a reference link kept from the original notes.
''' https://github.com/drivendataorg/box-plots-sklearn/blob/master/src/data/multilabel.py '''
mcr = OneVsRestClassifier(LogisticRegression())
mcr.fit(iris.data, iris.target)
mcr.predict(iris.data)
mcr.predict_proba(iris.data)
# ``test`` is called the holdout data from here on; it is only used for the
# final submission predictions.
holdout = test

all_y = train['Survived']
all_X = train[columns]
train_X, test_X, train_y, test_y = train_test_split(
    all_X, all_y, test_size=0.20, random_state=0)

# Fit on the 80% split and report accuracy on that same training split.
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(train_X, train_y)
Y_pred = random_forest.predict(test_X)
random_forest.score(train_X, train_y)
train_score = random_forest.score(train_X, train_y)
acc_random_forest = round(train_score * 100, 2)
print('train accuracy' + str(acc_random_forest))

# 10-fold cross-validation over the complete labelled data.
scores = cross_val_score(random_forest, all_X, all_y, cv=10)
accuracy = np.mean(scores)
print('Cross_val scores : ' + str(scores))
print('Cross_val accuracy : ' + str(accuracy))

# Refit on all labelled rows, then predict the holdout rows and write the
# submission file.
random_forest.fit(all_X, all_y)
holdout_predictions = random_forest.predict(holdout[columns])
holdout_ids = holdout["PassengerId"]
submission_df = {"PassengerId": holdout_ids, "Survived": holdout_predictions}
submission = pd.DataFrame(submission_df)
submission.to_csv("submission.csv", index=False)
# Load three earlier submissions and cross-tabulate the 'Survived' column of
# the third one (suffix _1) against the first one (suffix _2).
sub1 = pd.read_csv(
    '/Users/visheshkochher/Desktop/Python_ML_resources/datasciencedojo/kaggle/everyone_dies/submission_2018-07-13 09:14:21.csv')
sub2 = pd.read_csv(
    '/Users/visheshkochher/Desktop/Python_ML_resources/datasciencedojo/kaggle/everyone_dies/submission_2018-07-11 11:06:07.csv')
sub3 = pd.read_csv(
    '/Users/visheshkochher/Desktop/Python_ML_resources/datasciencedojo/kaggle/everyone_dies/submission_2018-07-13 09:17:57.csv')
sub_all = sub3.join(sub1, lsuffix='_1', rsuffix='_2')
pd.crosstab(sub_all['Survived_1'], sub_all['Survived_2'])

cv_model.cv_results_

# Refit a forest with the best parameters of the search and score it on the
# train and test data. Only valid when ``cv_model`` is a SearchCV object.
tree_model = RandomForestClassifier(random_state=297, **cv_model.best_params_).fit(
    trainX, trainY.values.ravel())
tree_model.score(trainX, trainY)
tree_model.score(testX, testY)

# Feature importances, one row per training column, plotted two ways.
feature_importance = pd.DataFrame(tree_model.feature_importances_,
                                  index=trainX.columns,
                                  columns=['Imp'])
feature_importance = feature_importance.reset_index()
feature_importance['pk'] = 1
plot_scatter(feature_importance, 'index', 'Imp', 'index')
plot_bar(feature_importance, 'index', 'Imp', 'index')

# Predictions (labels, then class probabilities) for all feature rows.
prediction = tree_model.predict(features_all)
tree_model.predict_proba(features_all)

#### VISUALIZE TREE
### ONLY FOR SIMPLE DECISION TREE
# tree.export_graphviz(tree_model,
if len(sys.argv) > 1 and sys.argv[1] == '--skip-train': results = pd.read_csv("./data/results.csv") else: for classification_dataset in classification_dataset_names: print("Starting", classification_dataset) X, y = fetch_data(classification_dataset, return_X_y=True, local_cache_dir='./data/') train_X, test_X, train_y, test_y = train_test_split(X, y) rf = RandomForestClassifier() lexRF = LexicaseForestClassifier() rf.fit(train_X, train_y) lexRF.fit(train_X, train_y) rf_score = rf.score(test_X, test_y) lexRF_score = lexRF.score(test_X, test_y) results['problem'] = results['problem'] + ([classification_dataset] * 2) results['method'] = results['method'] + ['RF', 'LexRF'] results['score'].append(rf_score) results['score'].append(lexRF_score) results = pd.DataFrame(results) results.to_csv("./data/results.csv", index=False) problems = ( results .groupby("problem") .apply(lambda x: x.score.max() - x.score.min()) .where(lambda x: x > 0.05)
# Check that the algorithm gives good accuracy by holding out 30% of the
# training set for validation.
train_train, train_test = train_test_split(train, test_size=0.3)

# Train model.
# max_features="sqrt" is what "auto" meant for classifiers; "auto" was
# deprecated in scikit-learn 1.1 and removed in 1.3.
model = RandomForestClassifier(n_estimators=100,
                               oob_score=True,
                               random_state=10,
                               max_features="sqrt",
                               min_samples_leaf=20)
# Alternative model: KNeighborsClassifier(n_neighbors=6)

# Troubleshooting notes:
# * "DataConversionWarning: A column-vector y was passed when a 1d array was
#   expected. Please change the shape of y to (n_samples, )" means a
#   one-column matrix was passed as y; ravel() fixes it.
# * "ValueError: Unknown label type: array" means the labels are floats;
#   convert them to int or str first, e.g. with .astype(int).
model.fit(train_train.loc[:, 'pixel0':'pixel783'],
          train_train.loc[:, 'label'].values.ravel())
print("model.score",
      model.score(train_test.loc[:, 'pixel0':'pixel783'],
                  train_test.loc[:, 'label'].values.ravel()))

# Possible follow-ups that are not run here: cross-validating the model on
# train_train (cv=3), or predicting train_test and comparing against its
# labels with accuracy_score.
import matplotlib.pyplot as plt
import numpy as np
import pylab as pl

features_train, labels_train, features_test, labels_test = makeTerrainData()

########################## DECISION TREE #################################
# Exercise: build two classifiers, one with min_samples_split=2 and one with
# min_samples_split=50, and store their accuracies on the testing data in
# acc_min_samples_split_2 and acc_min_samples_split_50 respectively.
# Although the exercise text speaks of decision trees, both classifiers
# built here are random forests.
clf2 = RandomForestClassifier(min_samples_split=2)
clf2.fit(features_train, labels_train)
clf50 = RandomForestClassifier(min_samples_split=50)
clf50.fit(features_train, labels_train)

acc_min_samples_split_2 = clf2.score(features_test, labels_test)
acc_min_samples_split_50 = clf50.score(features_test, labels_test)


def submitAccuracies():
    """Return both test accuracies, rounded to three decimals, by name."""
    rounded_split_2 = round(acc_min_samples_split_2, 3)
    rounded_split_50 = round(acc_min_samples_split_50, 3)
    return {"acc_min_samples_split_2": rounded_split_2,
            "acc_min_samples_split_50": rounded_split_50}
# Check whether the pixel<x> columns correlate linearly with the label. If
# they did, linear / logistic regression might work well with the data.
# No correlation is expected here: a higher pixel value does not mean a
# higher label value. To inspect: print(train.corr()["label"])

# Check that the algorithm gives good accuracy by holding out 30% of the
# training set for validation.
train_train, train_test = train_test_split(train, test_size=0.3)

# Train model.
# max_features="sqrt" is what "auto" meant for classifiers; "auto" was
# deprecated in scikit-learn 1.1 and removed in 1.3.
model = RandomForestClassifier(n_estimators=100,
                               oob_score=True,
                               random_state=10,
                               max_features="sqrt",
                               min_samples_leaf=20)
# Alternative model: KNeighborsClassifier(n_neighbors=6)

# Troubleshooting notes:
# * "DataConversionWarning: A column-vector y was passed when a 1d array was
#   expected. Please change the shape of y to (n_samples, )" means a
#   one-column matrix was passed as y; ravel() fixes it.
# * "ValueError: Unknown label type: array" means the labels are floats;
#   convert them to int or str first, e.g. with .astype(int).

# .loc replaces the .ix indexer, which was removed in pandas 1.0; label
# slices such as 'pixel0':'pixel783' stay inclusive on both ends.
model.fit(train_train.loc[:, 'pixel0':'pixel783'],
          train_train.loc[:, 'label'].values.ravel())
# print() call form: valid on Python 3, same output as the old statement.
print("model.score",
      model.score(train_test.loc[:, 'pixel0':'pixel783'],
                  train_test.loc[:, 'label'].values.ravel()))

# Possible follow-ups that are not run here: cross-validating the model on
# train_train (cv=3), or predicting train_test and comparing against its
# labels with accuracy_score.
dataset.columns = feature_names

# Split into train and test set.
train, test = train_test_split(dataset, test_size=0.3)

# Normalize data. Each part is standardised with its own statistics, and
# every column in feature_names is scaled, including 'target'.
df_scaled_train = pd.DataFrame(preprocessing.scale(train), columns=feature_names)
df_scaled_test = pd.DataFrame(preprocessing.scale(test), columns=feature_names)

# max_features="sqrt" is what "auto" meant for classifiers; "auto" was
# deprecated in scikit-learn 1.1 and removed in 1.3.
model = RandomForestClassifier(n_estimators=100,
                               oob_score=True,
                               random_state=10,
                               max_features="sqrt",
                               min_samples_leaf=20)

# Train model.
# Troubleshooting notes:
# * "DataConversionWarning: A column-vector y was passed when a 1d array was
#   expected" means a one-column matrix was passed as y; ravel() fixes it.
# * "ValueError: Unknown label type: array" means the labels are floats;
#   convert them to int or str first.
#
# Labels come from the unscaled frames: the scaled 'target' column holds
# standardised floats, and truncating those with astype(int) only recovers
# the original classes for some class balances. Row order is unchanged by
# scale(), so features and labels still line up.
# .loc replaces the .ix indexer, which was removed in pandas 1.0.
model.fit(df_scaled_train.loc[:, 'times pregnant':'age (years)'],
          np.asarray(train['target'].astype(int)))
print("Accuracy:",
      model.score(df_scaled_test.loc[:, 'times pregnant':'age (years)'],
                  np.asarray(test['target'].astype(int))))

# Predict output.
predicted = model.predict(df_scaled_test.loc[:, 'times pregnant':'age (years)'])
print(predicted)
import dataProcessing
import joblib
# Public import path: the private module ``sklearn.ensemble.forest`` was
# deprecated in scikit-learn 0.22 and removed in 0.24.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Concatenate the records of the four activity recordings.
data = (dataProcessing.readJson("activity1.json")
        + dataProcessing.readJson("activity2.json")
        + dataProcessing.readJson("activity3.json")
        + dataProcessing.readJson("activity4.json"))
Data = dataProcessing.generateDataSet(data)

# Labels are read from dataProcessing.target.
# NOTE(review): presumably generateDataSet() fills that attribute - confirm.
X_train, X_test, y_train, y_test = train_test_split(Data,
                                                    dataProcessing.target,
                                                    test_size=0.3)

# Fit a default random forest, report test score and test predictions, then
# persist the fitted model.
rfc = RandomForestClassifier()
rfc = rfc.fit(X_train, y_train)
result = rfc.score(X_test, y_test)
print(result)
print(rfc.predict(X_test))
joblib.dump(rfc, "rfcModel.pkl")
# Download the file.
# Python 3 moved urlopen from ``urllib`` into ``urllib.request``.
from urllib.request import urlopen

raw_data = urlopen(url)

# Get data, add column names and index.
feature_names = ["times pregnant",
                 "plasma glucose conc.",
                 "distolic blood pressure (mm Hg)",
                 "triceps skin fold thickness (mm)",
                 "2-hour serum insulin (mu U/ml)",
                 "body mass index (kg/m^2)",
                 "diabetes pedigree function",
                 "age (years)",
                 "target"]
# read_csv(index_col=0, parse_dates=True) is the documented replacement for
# DataFrame.from_csv, which was removed in pandas 1.0.
# NOTE(review): like from_csv, this treats the first line as a header row;
# if the file has no header, that record is lost - confirm against the data.
dataset = pd.read_csv(raw_data, index_col=0, parse_dates=True)
dataset = dataset.reset_index()
dataset.columns = feature_names

# Split into train and test set.
train, test = train_test_split(dataset, test_size=0.3)

# Normalize data. Each part is standardised with its own statistics, and
# every column in feature_names is scaled, including 'target'.
df_scaled_train = pd.DataFrame(preprocessing.scale(train), columns=feature_names)
df_scaled_test = pd.DataFrame(preprocessing.scale(test), columns=feature_names)

# max_features="sqrt" is what "auto" meant for classifiers; "auto" was
# deprecated in scikit-learn 1.1 and removed in 1.3.
model = RandomForestClassifier(n_estimators=100,
                               oob_score=True,
                               random_state=10,
                               max_features="sqrt",
                               min_samples_leaf=20)

# Train model.
# Troubleshooting notes:
# * "DataConversionWarning: A column-vector y was passed when a 1d array was
#   expected" means a one-column matrix was passed as y; ravel() fixes it.
# * "ValueError: Unknown label type: array" means the labels are floats;
#   convert them to int or str first.
#
# Labels come from the unscaled frames: the scaled 'target' column holds
# standardised floats, and truncating those with astype(int) only recovers
# the original classes for some class balances. Row order is unchanged by
# scale(), so features and labels still line up.
# .loc replaces the .ix indexer, which was removed in pandas 1.0.
model.fit(df_scaled_train.loc[:, 'times pregnant':'age (years)'],
          np.asarray(train['target'].astype(int)))
print("Accuracy:",
      model.score(df_scaled_test.loc[:, 'times pregnant':'age (years)'],
                  np.asarray(test['target'].astype(int))))

# Predict output.
predicted = model.predict(df_scaled_test.loc[:, 'times pregnant':'age (years)'])
print(predicted)
else: for classification_dataset in classification_dataset_names: print("Starting", classification_dataset) X, y = fetch_data(classification_dataset, return_X_y=True, local_cache_dir='./data/') train_X, test_X, train_y, test_y = train_test_split(X, y) rf = RandomForestClassifier() lexRF = LexicaseForestClassifier() rf.fit(train_X, train_y) lexRF.fit(train_X, train_y) rf_score = rf.score(test_X, test_y) lexRF_score = lexRF.score(test_X, test_y) results['problem'] = results['problem'] + ( [classification_dataset] * 2) results['method'] = results['method'] + ['RF', 'LexRF'] results['score'].append(rf_score) results['score'].append(lexRF_score) results = pd.DataFrame(results) results.to_csv("./data/results.csv", index=False) problems = (results.groupby("problem").apply(lambda x: x.score.max( ) - x.score.min()).where(lambda x: x > 0.05).dropna().index.values) viz_data = results[[x in problems for x in results.problem]]
from prep_terrain_data import makeTerrainData
# Public import path: the private module ``sklearn.ensemble.forest`` was
# deprecated in scikit-learn 0.22 and removed in 0.24.
from sklearn.ensemble import RandomForestClassifier

import numpy as np
import pylab as pl

features_train, labels_train, features_test, labels_test = makeTerrainData()

#################################################################################
########################## DECISION TREE #################################
# The section header speaks of a decision tree, but the classifier fitted
# here is a random forest with default settings.
clf = RandomForestClassifier()
clf.fit(features_train, labels_train)

# Accuracy is computed on the test set, not on the training data.
acc = clf.score(features_test, labels_test)


def submitAccuracies():
    """Return the test accuracy rounded to three decimals under key "acc"."""
    return {"acc": round(acc, 3)}