data["class"] = np.where(data["trending_time"]<=3., 1, 0.) #Data X = np.array(data_norm[["likes", "dislikes", "views", "comment_count", "trending_time"]]) y = np.array(data["class"]).squeeze() attributeNames = ["likes", "dislikes", "views", "comment_count", "trending_time"] N, M = X.shape #Bruger K-fold crossvalidation til at estimatere antallet af naboer k, til k-nearest neighbour classifier #k-fold cross validation with classifier K = 10 CV = model_selection.KFold(K, shuffle=True) L=40 # Maximum number of neighbors errors = np.zeros((N,L)) K = 10 internal_cross_validation = 10 k1=0 for train_index, test_index in CV1.split(X,y): # extract training and test set for current CV fold Xo_train = X[train_index,:] yo_train = y[train_index] Xo_test = X[test_index,:] yo_test = y[test_index]
import sklearn.linear_model as lm
from sklearn import model_selection
from toolbox_02450 import feature_selector_lr, bmplot
import numpy as np

# Load data from matlab file
# NOTE(review): loadmat is not imported in this excerpt -- presumably
# scipy.io.loadmat imported elsewhere in the file; verify.
mat_data = loadmat('../Data/body.mat')
X = mat_data['X']
y = mat_data['y'].squeeze()
attributeNames = [name[0] for name in mat_data['attributeNames'][0]]
N, M = X.shape

## Crossvalidation
# Create crossvalidation partition for evaluation
K = 5
CV = model_selection.KFold(n_splits=K, shuffle=True)

# Initialize variables: per-fold selected features and the train/test
# errors with and without feature selection (baseline = no features).
Features = np.zeros((M, K))
Error_train = np.empty((K, 1))
Error_test = np.empty((K, 1))
Error_train_fs = np.empty((K, 1))
Error_test_fs = np.empty((K, 1))
Error_train_nofeatures = np.empty((K, 1))
Error_test_nofeatures = np.empty((K, 1))

k = 0
for train_index, test_index in CV.split(X):
    # extract training and test set for current CV fold
    X_train = X[train_index, :]
    # (loop body continues beyond this excerpt)
# In[ ]: X = array[:, 0:2] # In[ ]: Y = array[:, 3] # In[ ]: seed = 1234567890 # In[ ]: kfold = model_selection.KFold(n_splits=10, random_state=seed) # In[ ]: model = LogisticRegression() # In[ ]: scoring = 'accuracy' # In[ ]: results = model_selection.cross_val_score(model, X, Y, cv=kfold,
def __cross_validation(self, classifier, X, y, k, stratify=True):
    """Validate *classifier* via k-fold cross-validation.

    Also serves leave-one-out validation (used when ``k`` equals the
    number of samples).

    :param classifier: classifier for validation
    :type classifier: sklearn classifier object
    :param X: feature values of training data (including training and validation sets)
    :type X: pandas dataframe
    :param y: labels of training data
    :type y: pandas series
    :param k: number of folds
    :type k: int
    :param stratify: draw samples according to class proportions or not
    :type stratify: bool
    :returns: performance metrics on training and validation data
    """
    leave_one_out = k == X.shape[0]
    if leave_one_out:
        # one sample per fold; shuffling/stratification is meaningless here
        splitter = model_selection.KFold(n_splits=k)
    elif stratify:
        splitter = model_selection.StratifiedKFold(n_splits=k, shuffle=True,
                                                   random_state=0)
    else:
        splitter = model_selection.KFold(n_splits=k, shuffle=True,
                                         random_state=0)

    # per-fold ground truth, hard predictions and class probabilities
    train_true, train_pred, train_prob = [], [], []
    val_true, val_pred, val_prob = [], [], []

    for fit_idx, hold_idx in splitter.split(X, y):
        X_fit, X_hold = X.iloc[fit_idx], X.iloc[hold_idx]
        y_fit, y_hold = y.iloc[fit_idx], y.iloc[hold_idx]
        train_true.append(y_fit)
        val_true.append(y_hold)

        # promote convergence warnings to errors so a non-converging fit
        # aborts the whole validation run
        with warnings.catch_warnings():
            warnings.filterwarnings('error',
                                    category=exceptions.ConvergenceWarning)
            try:
                classifier = classifier.fit(X_fit, y_fit)
            except exceptions.ConvergenceWarning:
                # roll back the model counter before propagating
                Model.counter -= 1
                raise

        train_pred.append(classifier.predict(X_fit))
        val_pred.append(classifier.predict(X_hold))
        train_prob.append(classifier.predict_proba(X_fit))
        val_prob.append(classifier.predict_proba(X_hold))

    train_metrics = ModelMetrics(classifier, train_true, train_pred,
                                 train_prob, 'cv')
    if leave_one_out:
        # flatten the one-sample folds into single arrays
        return train_metrics, ModelMetrics(classifier,
                                           np.hstack(val_true),
                                           np.hstack(val_pred),
                                           np.vstack(val_prob), 'loo')
    return train_metrics, ModelMetrics(classifier, val_true, val_pred,
                                       val_prob, 'cv')
def compare_predictions(df, y_var_name, percent_data=None,
                        category_limit=11, knots=3,
                        alphas=np.logspace(start=-2, stop=10, num=50),
                        corr_matrix=True, scatter_matrix=True,
                        bootstrap_coefs=True, feature_importances=True,
                        partial_dep=True, actual_vs_predicted=True,
                        residuals=True, univariates=True, compare_models=True,
                        ROC=True, bootstraps=10):
    """Takes dataframe
    INPUT: name: string, a feature name to spline
    knots: int, number knots (divisions) which are divisions between splines.
    OUTPUT: pipeline

    End-to-end model comparison: cleans the frame, plots exploratory
    graphics, spline-transforms features, cross-validates a set of models,
    fits each on all data and plots diagnostics, then returns the fitted
    models along with per-model predictions and errors.
    """
    starttotal = time()
    df, sample_limit = clean_dataframe(df, y_var_name, percent_data)
    # REMEMBER OLD DATAFRAME
    df_unpiped, df_X_unpiped = df.copy(), df.copy().drop(y_var_name, axis=1)
    (unpiped_continuous_features,
     unpiped_category_features) = sort_features(df_X_unpiped)
    columns_unpiped = df_X_unpiped.columns
    # REMOVE CATEGORICAL VARIABLES THAT HAVE TOO MANY CATEGORIES TO BE USEFUL
    df = drop_category_exeeding_limit(df, y_var_name, category_limit)
    # SHOW CORRELATION MATRIX
    if corr_matrix:
        if len(unpiped_continuous_features) > 0:
            timeit(plt.matshow, df.sample(sample_limit).corr())
    # MAKE SCATTER MATRIX
    if scatter_matrix:
        if len(unpiped_continuous_features) > 0:
            timeit(plot_scatter_matrix, df, y_var_name, colors=True)
            plt.show()
    # TRANSFORM DATAFRAME
    print('DF COLUMNS: \n' + str(list(df.columns)) + '\n')
    df, df_X, X, y, pipeline = use_spline(df, y_var_name)
    print('DF COLUMNS AFTER TRANSFORM: \n' + str(list(df.columns)) + '\n')
    # MAKE MODELS
    (names_models, continuous_features, category_features, models, scoring,
     is_continuous, alphas) = make_models(df, df_X, y, y_var_name,
                                          univariates, alphas)
    # evaluate each model in turn
    fit_models, results, names, y_hats, errors, seed = [], [], [], [], [], 7
    for name, model in tqdm.tqdm(names_models):
        # if not linear: change df_X to df_X unpiped
        # NOTE(review): random_state without shuffle=True raises ValueError
        # in scikit-learn >= 0.24 -- confirm the sklearn version targeted.
        kfold = model_selection.KFold(n_splits=10, random_state=seed)
        if name == 'RR' or name == 'LASSO':
            alpha, cv_results = timeit(plot_choose_alpha, df, model,
                                       y_var_name, alphas, kfold, scoring)
            # NOTE(review): `model` is called here, so for RR/LASSO the
            # entry in names_models is presumably a class/factory taking
            # alpha -- verify against make_models.
            model = model(alpha)
        else:
            cv_results = timeit(cross_val_score, model, X, y, cv=kfold,
                                scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: mean=%f std=%f" % (name, cv_results.mean(),
                                      cv_results.std())
        print(msg)
        # OTHER CROSS VALIDATE METHOD:
        # FIT MODEL WITH ALL DATA
        model.fit(X, y)
        fit_models.append(model)
        # PLOT PREDICTED VS ACTUALS
        if is_continuous:
            timeit(plot_predicted_vs_actuals, df, model, y_var_name,
                   sample_limit)
            plt.show()
        # MAKE BOOTSTRAPS
        if bootstrap_coefs or partial_dep:
            bootstrap_models = bootstrap_train_premade(model, X, y,
                                                       bootstraps=bootstraps,
                                                       fit_intercept=False)
        # PLOT COEFFICIANTS
        if hasattr(model, "coef_"):
            coefs = model.coef_
            columns = list(df.drop(y_var_name, axis=1).columns)
            # unwrap nested coefficient arrays (e.g. shape (1, n))
            while (type(coefs[0]) is list) or (type(coefs[0]) is np.ndarray):
                coefs = list(coefs[0])
            timeit(plot_coefs, coefs=coefs, columns=columns, graph_name=name)
            plt.show()
            # PLOT BOOTSTRAP COEFFICIANTS
            if is_continuous:
                if bootstrap_coefs:
                    # PLOT BOOTSTRAP COEFS
                    fig, axs = timeit(plot_bootstrap_coefs, bootstrap_models,
                                      df_X.columns, n_col=4)
                    fig.tight_layout()
                    plt.show()
        # PLOT FEATURE IMPORTANCES
        if feature_importances:
            if 'feature_importances_' in dir(model):
                timeit(plot_feature_importances, model, df_X)
                plt.show()
        # PLOT PARTIAL DEPENDENCIES
        if partial_dep:
            timeit(plot_partial_dependences, model, X=df_X_unpiped,
                   var_names=unpiped_continuous_features, y=y,
                   bootstrap_models=bootstrap_models, pipeline=pipeline,
                   n_points=250)
            plt.tight_layout()
            plt.show()
        # PLOT PREDICTED VS ACTUALS
        plot_continuous_error_graphs(df, y, y_var_name, model, is_continuous,
                                     sample_limit,
                                     predicteds_vs_actuals=True,
                                     residuals=True)
        df_X = df.drop(y_var_name, axis=1)
        # GET ERROR
        y_hat, error = get_error(name, model, df_X, y, is_continuous)
        y_hats.append(y_hat)
        errors.append(error)
    # --COMPARE MODELS--
    if compare_models:
        choose_box_and_violin_plots(names, scoring, compare_models, results,
                                    is_continuous)
    # ROC CURVE
    if ROC:
        if not is_continuous:
            # NOTE(review): this plots `models` (from make_models), not the
            # refit `fit_models` -- confirm which list is intended.
            timeit(plot_rocs, models, df_X, y)
            plt.show()
    print(f'MAKE SUBSAMPLE TIME: {time() - starttotal}')
    return names, results, fit_models, pipeline, df_X, y_hats, errors
def _report_validation(label, estimator, X_train, Y_train, X_validation,
                       Y_validation):
    """Fit *estimator* on the training split and print its validation
    accuracy, confusion matrix and classification report.

    Returns the validation predictions so callers can compute extra
    statistics (e.g. precision) without refitting.
    """
    print(label)
    estimator.fit(X_train, Y_train)
    predictions = estimator.predict(X_validation)
    print(accuracy_score(Y_validation, predictions))
    print(confusion_matrix(Y_validation, predictions))
    print(classification_report(Y_validation, predictions))
    return predictions


def models(df):
    """Cross-validate six classifier families on the engineered features
    and print a detailed hold-out validation report for each.

    :param df: raw dataframe; features are derived via getFeaturesForModels
    """
    df = getFeaturesForModels(df)
    array = df.values
    X = array[:, 0:22]  # feature columns
    Y = array[:, 23]    # label column
    validation_size = 0.20
    seed = 7
    X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
        X, Y, test_size=validation_size, random_state=seed)
    scoring = 'accuracy'

    models = [
        ('LR', LogisticRegression()),
        ('LDA', LinearDiscriminantAnalysis()),
        ('KNN', KNeighborsClassifier()),
        ('CART', DecisionTreeClassifier()),
        ('NB', GaussianNB()),
        ('SVM', SVC()),
    ]

    # evaluate each model in turn with 10-fold CV on the training split
    results = []
    names = []
    for name, model in models:
        # NOTE(review): random_state only takes effect with shuffle=True;
        # scikit-learn >= 0.24 raises for this combination -- confirm the
        # sklearn version this project targets.
        kfold = model_selection.KFold(n_splits=10, random_state=seed)
        cv_results = model_selection.cross_val_score(
            model, X_train, Y_train, cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

    # detailed hold-out reports (duplicated sections folded into a helper)
    _report_validation("KNeighborsClassifier", KNeighborsClassifier(),
                       X_train, Y_train, X_validation, Y_validation)
    _report_validation("CART", DecisionTreeClassifier(),
                       X_train, Y_train, X_validation, Y_validation)
    _report_validation("LR", LogisticRegression(),
                       X_train, Y_train, X_validation, Y_validation)
    _report_validation("LDA", LinearDiscriminantAnalysis(),
                       X_train, Y_train, X_validation, Y_validation)
    _report_validation("NB", GaussianNB(),
                       X_train, Y_train, X_validation, Y_validation)

    # SVM handled inline: it additionally reports positive-class precision.
    print("SVM")
    SVM = SVC()  # BUG FIX: the original instantiated GaussianNB() here
    SVM.fit(X_train, Y_train)
    predictions = SVM.predict(X_validation)
    print(accuracy_score(Y_validation, predictions))
    cm = confusion_matrix(Y_validation, predictions)
    print(cm)
    # precision of the positive class: TP / (TP + FP)
    precision = float(cm[1][1]) / float(cm[1][1] + cm[0][1])
    print(precision)
    print(classification_report(Y_validation, predictions))
# Map model-name keys to zero-argument factories so adding a model is one
# line and lookup replaces the original 18-branch elif chain.
_MODEL_BUILDERS = {
    "LogReg": lambda: LogisticRegression(),
    "SVM": lambda: SVC(),
    "DecTree": lambda: DecisionTreeClassifier(),
    "KNN": lambda: KNeighborsClassifier(),
    "LinDisc": lambda: LinearDiscriminantAnalysis(),
    "GaussianNB": lambda: GaussianNB(),
    "MLP": lambda: MLPClassifier(),
    "GaussianPC": lambda: GaussianProcessClassifier(),
    "RandomForest": lambda: RandomForestClassifier(),
    "AdaBoost": lambda: AdaBoostClassifier(),
    "QuadraticDisc": lambda: QuadraticDiscriminantAnalysis(),
    "SVClinear": lambda: SVC(kernel="linear", C=0.025),
    "SVCgamma": lambda: SVC(gamma=2, C=1),
    "KNN3": lambda: KNeighborsClassifier(3),
    "GaussianRBF": lambda: GaussianProcessClassifier(1.0 * RBF(1.0)),
    "DecTreeDepth": lambda: DecisionTreeClassifier(max_depth=5),
    "RandomForestDepth": lambda: RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1),
    "MLPalpha": lambda: MLPClassifier(alpha=1),
}


def analyse(model_name):
    """Run 10-fold cross-validated accuracy for one named model on the
    HEP demo dataset.

    :param model_name: key into _MODEL_BUILDERS selecting the classifier
    :returns: (model_name, results) where results is the array of fold
        accuracies, or (model_name, []) if the name is unknown or the
        analysis failed (original best-effort contract preserved)
    """
    try:
        # read in the data (HEP): first column is the label, the next 27
        # columns are the features
        data = pd.read_csv(
            "https://s3-us-west-2.amazonaws.com/iqoqo.temp/demo/all_train_10000.csv"
        )
        data_to_use = data
        data_to_use.dropna(inplace=True)
        values = data_to_use.values
        Y = values[:, 0]
        X = values[:, 1:28]

        print("--- starting " + model_name + " analysis ---")
        builder = _MODEL_BUILDERS.get(model_name)
        if builder is None:
            # The original called quit() here, but its SystemExit was then
            # swallowed by a bare except and turned into an empty result;
            # return the empty result explicitly instead.
            print("Model name not found: " + model_name)
            return model_name, []
        model = builder()

        # NOTE(review): random_seed is expected to be a module-level
        # global; also, random_state without shuffle=True raises in
        # scikit-learn >= 0.24 -- confirm the targeted sklearn version.
        k_fold_validation = model_selection.KFold(n_splits=10,
                                                  random_state=random_seed)
        results = model_selection.cross_val_score(model, X, Y,
                                                  cv=k_fold_validation,
                                                  scoring="accuracy")
        print("%s| Mean=%f STD=%f" % (model_name, results.mean(),
                                      results.std()))
        print("--- done " + model_name + " analysis ---")
        return model_name, results
    except Exception:
        # BUG FIX: was a bare `except:` which also swallowed SystemExit and
        # KeyboardInterrupt; Exception keeps the best-effort behavior while
        # letting real interrupts propagate.
        return model_name, []
data_test.loc[data_test["Embarked"] == "Q", "Embarked"] = 2
test_features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
# Build the Survived column for the test set  (translated from Chinese)
data_test["Survived"] = -1
test_predictors = data_test[test_features]
data_test["Survived"] = logRegAlg.predict(test_predictors)
print(data_test.head(10))

# Use a random-forest algorithm  (translated from Chinese)
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
# 10 trees; stopping conditions: min 2 samples to split, min 1 per leaf
# (translated from Chinese)
alg = RandomForestClassifier(random_state=1, n_estimators=10,
                             min_samples_split=2, min_samples_leaf=1)
# NOTE(review): random_state together with shuffle=False has no effect and
# raises ValueError in scikit-learn >= 0.24 -- confirm intended folds.
kf = model_selection.KFold(n_splits=3, shuffle=False, random_state=1)
scores = model_selection.cross_val_score(alg, data_train[predictors],
                                         data_train["Survived"], cv=kf)
print(scores)
print(scores.mean())

# Increase to 30 trees; switch to 10-fold cross-validation
# (translated from Chinese)
alg = RandomForestClassifier(random_state=1, n_estimators=30,
                             min_samples_split=2, min_samples_leaf=1)
kf = model_selection.KFold(n_splits=10, shuffle=False, random_state=1)
scores = model_selection.cross_val_score(alg, data_train[predictors],
learning_goal = 1 # stop criterion 1 (train mse to be reached) max_epochs = 300 # stop criterion 2 (max epochs in training) show_error_freq = 50 # frequency of training status updates # Getting the min max range for every attribute and the y vector # minMaxRange = [[min(X[:, 0]), max (X[:, 0])], [min(X[:, 1]), max(X[:, 1])], [min(X[:, 2]), max(X[:,2])], # [min(X[:, 3]), max (X[:, 3])], [min(X[:, 4]), max(X[:, 4])], [min(X[:, 5]), max(X[:,5])], # [min(X[:, 6]), max(X[:, 6])], [min(X[:, 7]), max(X[:, 7])]] minMaxRange = [[0, 1]] * M # K-fold crossvalidation # Outer loop K = 5 # Inner Loop J = 3 CV_1 = model_selection.KFold(K, shuffle=False) CV_2 = model_selection.KFold(J, shuffle=False) # Variable for classification error errors = np.zeros((J, K)) * np.nan gen_errors = np.zeros(K) * np.nan error_hist = np.zeros((max_epochs, K)) * np.nan bestnet = list() best_bestnet = list() k = 0 bestnet_hidden_units = np.zeros((J, K)) * np.nan y_best_est = [] best_performing_anns = [] best_hidden_neurons_outer = [] mean_errors = np.zeros(K) * np.nan for train_index, test_index in CV_1.split(X, y):
* 保存交叉验证在训练集上的结果,并保存下来计算gini
'''
# (the string above is the tail of a module docstring opened before this excerpt)
start_all0 = datetime.datetime.now()
test_x, testID = deal_test()
train_x, train_y = deal_train()
# NOTE(review): fillna is NOT in-place by default -- both results below are
# discarded, so NaNs remain in train_x/test_x; likely a bug (use the return
# value or inplace=True).
train_x.fillna(0)
test_x.fillna(0)
print("*******DATA*******:{:.2f} hours".format((datetime.datetime.now()-start_all0).seconds/3600))

start_all = datetime.datetime.now()
#-----model------
n_split = 5
# NOTE(review): random_state with shuffle=False has no effect and raises a
# ValueError in scikit-learn >= 0.24 -- confirm the intended fold scheme.
cv_split = model_selection.KFold(n_splits=n_split, random_state=15,
                                 shuffle=False)
gbm = lgb.LGBMRegressor(
    objective='regression', num_leaves=6,
    learning_rate=0.02, n_estimators=500,
    max_bin=55, bagging_fraction=0.8, bagging_freq=5, feature_fraction=0.5,
    feature_fraction_seed=None, bagging_seed=None,
    min_data_in_leaf=1, min_sum_hessian_in_leaf=11, min_data=1
)
# holds each fold's predictions on the test set  (translated from Chinese)
y_pred_test_cv = np.ones((test_x.shape[0], n_split))
y_pred_train_cv = np.ones((train_x.shape[0],))
for i, (train_index, test_index) in enumerate(cv_split.split(train_x, train_y)):
    gbm.fit(train_x.iloc[train_index], train_y[train_index])
    # (loop body continues beyond this excerpt)
from sklearn.tree import DecisionTreeClassifier
from sklearn import model_selection as ms

if __name__ == '__main__':
    # Load the pre-cleaned Titanic-style dataset.
    df = pd.read_csv('cleaned.csv')
    data = df.copy()

    # feature selection
    cols = ['Age', 'SibSp', 'Parch', 'male', 'female', 'class1',
            'class2', 'class3', 'embarkedS', 'embarkedC', 'embarkedQ']
    X = data[cols]
    y = data['Survived']

    # train-test split
    X_train, X_test, y_train, y_test = ms.train_test_split(
        X, y, test_size=0.3, random_state=0)
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.score(X_test, y_test))

    # one step forward??(similar to ARIMA??)
    # cross validation on the training split
    # BUG FIX: KFold(random_state=...) without shuffle=True is rejected by
    # scikit-learn >= 0.24 (random_state is only meaningful when shuffling).
    kfold = ms.KFold(n_splits=10, shuffle=True, random_state=0)
    model_cv = DecisionTreeClassifier(random_state=0)
    results_cv = ms.cross_val_score(model_cv, X_train, y_train, cv=kfold,
                                    scoring='accuracy')
    print(results_cv.mean())
    # how to improve accuracy?? (parameters of the classifier??)
# Logistic Regression: fit on the training split, report on the test split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# In[ ]:

# Bagged Decision Trees for Classification
# BUG FIX: random_state requires shuffle=True in KFold (scikit-learn >= 0.24
# raises ValueError for random_state with shuffle=False).
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=10)
model_1 = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                            n_estimators=100, random_state=10)
results_1 = model_selection.cross_val_score(model_1, x, y, cv=kfold)
print(results_1.mean())

# In[32]:

# Random Forest Classification
kfold_rf = model_selection.KFold(n_splits=10)
model_rf = RandomForestClassifier(n_estimators=100, max_features=5)
results_rf = model_selection.cross_val_score(model_rf, x, y, cv=kfold_rf)
print(results_rf.mean())
# split the model into train and test set iris_data_train, iris_data_test, iris_targets_train , iris_targets_test = \ model_selection.train_test_split(iris_data, iris_targets, test_size=0.25) # K-nearest neighbor classifier knn_iris = neighbors.KNeighborsClassifier(n_neighbors=13) knn_iris.fit(iris_data_train, iris_targets_train) iris_target_pred_knn = knn_iris.predict(iris_data_test) # Another way to get accuracy explicitly is: np.mean(y == y_pred) print("KNN Accuracy (Iris Dataset):", metrics.accuracy_score(iris_targets_test, iris_target_pred_knn)) # K-fold cross validation iris_kfold = model_selection.KFold(n_splits=4, shuffle=True) cv_score = model_selection.cross_val_score(knn_iris, X=iris_data, y=iris_targets, cv=iris_kfold) print("Cross-validation score is %s" % cv_score, "Mean CV is %s" % np.mean(cv_score)) # Compare the performance of kNN for different k: k_accuracy_scores = np.zeros((49, 400)) # for k in range(2, 51): # for rep in range(1, 400): # iris_data_train, iris_data_test, iris_targets_train, iris_targets_test = \ # model_selection.train_test_split(iris_data, iris_targets, test_size=0.25) # knn_test_iris = neighbors.KNeighborsClassifier(n_neighbors=k) # knn_test_iris.fit(iris_data_train, iris_targets_train)
def main():
    """Compare classifiers on the mushroom dataset, then run three
    feature-selection methods (chi2 SelectKBest, RFE, tree importances)
    and print the conclusions."""
    # load dataset
    url_dataset = "script/mushrooms.csv"
    dataset = pandas.read_csv(url_dataset)
    # check the attibutes
    get_unique_attribute(dataset)
    # remove veil-type attribute
    del dataset['veil-type']
    # print(dataset.columns.values)
    # encode the attributes with LabelEncoder
    to_be_encoded_cols = dataset.columns.values
    label_encode(dataset, to_be_encoded_cols)
    # check the attibutes after encoded
    # get_unique_attribute(dataset)
    # split-out validation dataset
    array = dataset.values
    # NOTE(review): X spans columns 0:22 while Y is column 0, so the label
    # column appears to be included in the features (target leakage) --
    # verify which column holds the class and slice X accordingly.
    X = array[:, 0:22]
    Y = array[:, 0]
    validation_size = 0.20
    seed = 7
    scoring = "accuracy"
    X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
        X, Y, test_size=validation_size, random_state=seed)
    # Spot Check Algorithms
    models = []
    models.append(('LR', LogisticRegression()))
    models.append(('RF', RandomForestClassifier()))
    models.append(('KNN', KNeighborsClassifier()))
    models.append(('TREE', DecisionTreeClassifier()))
    models.append(('NB', GaussianNB()))
    models.append(('SVM', SVC()))
    # evaluate each model in turn
    results = []
    names = []
    for name, model in models:
        # NOTE(review): random_state without shuffle=True raises ValueError
        # in scikit-learn >= 0.24 -- confirm the targeted sklearn version.
        kfold = model_selection.KFold(n_splits=10, random_state=seed)
        cv_results = model_selection.cross_val_score(model, X_train, Y_train,
                                                     cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)
    # boxplot algorithm comparison
    fig = plt.figure()
    fig.suptitle('Algorithm Comparison')
    ax = fig.add_subplot(111)
    plt.boxplot(results)
    ax.set_xticklabels(names)
    plt.show()
    # feature extraction: chi-squared univariate scores
    test = SelectKBest(score_func=chi2, k=4)
    fit = test.fit(X, Y)
    # summarize scores
    np.set_printoptions(precision=3)
    print(fit.scores_)
    # feature extraction: recursive feature elimination down to 3 features
    model = LogisticRegression()
    rfe = RFE(model, 3)
    fit = rfe.fit(X, Y)
    print("Num Features: {}".format(fit.n_features_))
    print("Selected Features: {}".format(fit.support_))
    print("Feature Ranking: {}".format(fit.ranking_))
    # feature extraction: impurity-based importances from extra trees
    model = ExtraTreesClassifier()
    model.fit(X, Y)
    print(model.feature_importances_)
    print("The Conclusion")
    print(
        "Best Machine Learning Type for Mushroom Dataset is Binary Classification Model"
    )
    print(
        "because basicaly, the model is only divided to be two classes that are edible(e) and poisonous(p)"
    )
    print(
        "The Best Algorithm for Mushroom Dataset is DecisionTree, because DecisionTree is the most stable than others"
    )
    print(
        "If we change the seed,numbers of dataset itself whether increase or reduce,we can see that the accuracy of decision tree is highest and stable"
    )
    print(
        "As We can see at the result above(there are three feature selections/extractions), and we can pull the decision that Odor is the indicatived feature"
    )
    print(
        "The Most Importance or Indicative Feature/Attribute is Odor and it has big influence to predict whether the mushroom is edible/poisonous"
    )
    print(
        "Odor with value except Almond/None/Anise tend to be most indicatived attribute of poisonous mushroom"
    )
# (tail of a function opened before this excerpt; from the K.* calls it
# presumably computes a Keras RMSE between y_pred and y_true -- verify)
    return K.sqrt(K.mean(K.square(y_pred - y_true)))


i = 0
nbag = 1   # number of bagging rounds
nfold = 5  # folds per round
# out-of-fold containers for validation and test predictions (2 columns)
oobval = np.zeros((train_df.shape[0], 2))
oobtest = np.zeros((test_df.shape[0], 2))
valerr = []
val_scores = []
np.random.seed(2018)
for x in np.arange(nbag):
    for seed in [2018]:
        kf = model_selection.KFold(n_splits=nfold, shuffle=True,
                                   random_state=seed)
        for dev_index, val_index in kf.split(y):
            # split every feature block with the same fold indices
            dev_X, val_X = train_df.values[dev_index, :], train_df.values[
                val_index, :]
            dev_y, val_y = y[dev_index], y[val_index]
            param_train_tfidf_dev, param_train_tfidf_val = param_train_tfidf[
                dev_index, :], param_train_tfidf[val_index, :]
            title_train_tfidf_dev, title_train_tfidf_val = title_train_tfidf[
                dev_index, :], title_train_tfidf[val_index, :]
            desc_train_tfidf_dev, desc_train_tfidf_val = desc_train_tfidf[
                dev_index, :], desc_train_tfidf[val_index, :]
            region_dev, region_val = region_train[dev_index, :], region_train[
                val_index, :]
            pcn_dev, pcn_val = pcn_train[dev_index, :], pcn_train[val_index, :]
            cn_dev, cn_val = cn_train[dev_index, :], cn_train[val_index, :]
            # (loop body continues beyond this excerpt)
def dataupload():
    """Flask view: accept a CSV upload, save it, summarize it, run six
    classifiers with 10-fold CV, persist the upload to the DB, and render
    the details page."""
    if request.method == 'POST' and 'csv_data' in request.files:
        file = request.files['csv_data']
        filename = secure_filename(file.filename)
        # os.path.join is used so that paths work in every operating system
        # file.save(os.path.join("wherever","you","want",filename))
        file.save(os.path.join('static/uploadsDB', filename))
        fullfile = os.path.join('static/uploadsDB', filename)

        # For Time
        date = str(
            datetime.datetime.fromtimestamp(
                time.time()).strftime("%Y-%m-%d %H:%M:%S"))

        # EDA function
        df = pd.read_csv(os.path.join('static/uploadsDB', filename))
        df_size = df.size
        df_shape = df.shape
        df_columns = list(df.columns)
        df_targetname = df[df.columns[-1]].name
        df_featurenames = df_columns[0:
                                     -1]  # select all columns till last column
        df_Xfeatures = df.iloc[:, 0:-1]
        df_Ylabels = df[df.columns[-1]]  # Select the last column as target
        # same as above df_Ylabels = df.iloc[:,-1]

        # Model Building
        X = df_Xfeatures
        Y = df_Ylabels
        seed = 7
        # prepare models
        models = []
        models.append(('LR', LogisticRegression()))
        models.append(('LDA', LinearDiscriminantAnalysis()))
        models.append(('KNN', KNeighborsClassifier()))
        models.append(('CART', DecisionTreeClassifier()))
        models.append(('NB', GaussianNB()))
        models.append(('SVM', SVC()))
        # evaluate each model in turn
        results = []
        names = []
        allmodels = []
        scoring = 'accuracy'
        for name, model in models:
            # NOTE(review): random_state without shuffle=True raises in
            # scikit-learn >= 0.24 -- confirm the targeted version.
            kfold = model_selection.KFold(n_splits=10, random_state=seed)
            cv_results = model_selection.cross_val_score(model, X, Y,
                                                         cv=kfold,
                                                         scoring=scoring)
            results.append(cv_results)
            names.append(name)
            msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
            allmodels.append(msg)
        model_results = results
        model_names = names

        # Saving Results of Uploaded Files to Sqlite DB
        # NOTE(review): file.read() after file.save() likely returns b''
        # because the stream position is at EOF -- seek(0) first or re-read
        # from disk if the raw bytes are wanted.
        # NOTE(review): modeldata=msg stores only the LAST model's summary;
        # allmodels holds all of them -- confirm which was intended.
        newfile = FileContents(name=file.filename,
                               data=file.read(),
                               modeldata=msg)
        db.session.add(newfile)
        db.session.commit()

        return render_template('details.html',
                               filename=filename,
                               date=date,
                               df_size=df_size,
                               df_shape=df_shape,
                               df_columns=df_columns,
                               df_targetname=df_targetname,
                               model_results=allmodels,
                               model_names=names,
                               fullfile=fullfile,
                               dfplot=df)
def binaryClassifiers(train_bag, train_class, test_bag, test_class):
    """Fit a suite of binary classifiers on the bag-of-words training data
    and report each one's results (via predictFitBinary) with timing.

    (Python 2 source: uses print statements.)
    """
    print "Naive Bayes"
    start = time.time()
    naive = MultinomialNB()
    predictFitBinary(naive, train_bag, train_class, test_bag, test_class)
    end = time.time()
    print "time: ", (end - start)
    # calcROC(naive, test_bag, test_class, 'NB')
    print

    print "Logistic Regression"
    start = time.time()
    logreg = linear_model.LogisticRegression()
    predictFitBinary(logreg, train_bag, train_class, test_bag, test_class)
    end = time.time()
    print "time: ", (end - start)
    # calcROC(logreg, test_bag, test_class, 'LR' )
    print

    print "SVM (Linear Kernel)"
    svml = SVC(kernel='linear', probability=True)
    start = time.time()
    predictFitBinary(svml, train_bag, train_class, test_bag, test_class)
    end = time.time()
    print "time: ", (end - start)
    # calcROC(svml, test_bag, test_class, 'SVML')
    print

    print "SVM (Gaussian Kernel)"
    start = time.time()
    svmg = SVC(kernel='rbf', probability=True)
    predictFitBinary(svmg, train_bag, train_class, test_bag, test_class)
    end = time.time()
    print "time: ", (end - start)
    # calcROC(svmg, test_bag, test_class, 'SVMG')
    print

    print "Decision Tree"
    start = time.time()
    dt = DecisionTreeClassifier(min_samples_split=20, random_state=99)
    predictFitBinary(dt, train_bag, train_class, test_bag, test_class)
    end = time.time()
    print "time: ", (end - start)
    # calcROC(dt, test_bag, test_class, 'DT')

    print "K Nearest Neighbors"
    start = time.time()
    # NOTE(review): copy-paste bug -- this section builds a second
    # LogisticRegression and re-evaluates `logreg` instead of a KNN model;
    # it should presumably use KNeighborsClassifier (check that it is
    # imported at the top of the file before fixing).
    knn = linear_model.LogisticRegression()
    predictFitBinary(logreg, train_bag, train_class, test_bag, test_class)
    end = time.time()
    print "time: ", (end - start)
    # calcROC(logreg, test_bag, test_class, 'LR' )
    print

    # adapted from http://machinelearningmastery.com/ensemble-machine-learning-algorithms-python-scikit-learn/
    print "Random Forest Classifier"
    seed = 7
    num_trees = 100
    max_features = 100
    # NOTE(review): random_state is only used by KFold when shuffle=True;
    # scikit-learn >= 0.24 rejects this combination.
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    rf = RandomForestClassifier(n_estimators=num_trees,
                                max_features=max_features)
    predictFitBinary(rf, train_bag, train_class, test_bag, test_class)
    results = model_selection.cross_val_score(rf, train_bag, train_class,
                                              cv=kfold)
    print(results.mean())
def main(configuration_path, signal_path, predictions_path, disp_model_path, sign_model_path, key, verbose):
    '''
    Train two learners to be able to reconstruct the source position.
    One regressor for disp and one classifier for the sign of delta.

    Both pmml and pickle format are supported for the output.

    CONFIGURATION_PATH: Path to the config yaml file

    SIGNAL_PATH: Path to the signal data

    PREDICTIONS_PATH : path to the file where the mc predictions are stored.

    DISP_MODEL_PATH: Path to save the disp model to.

    SIGN_MODEL_PATH: Path to save the sign model to.
        Allowed extensions are .pkl and .pmml.
        If extension is .pmml, then both pmml and pkl file will be saved
    '''
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    log = logging.getLogger()

    config = AICTConfig.from_yaml(configuration_path)
    model_config = config.disp
    np.random.seed(config.seed)

    # Both estimators share the global seed for reproducibility.
    disp_regressor = model_config.disp_regressor
    sign_classifier = model_config.sign_classifier
    disp_regressor.random_state = config.seed
    sign_classifier.random_state = config.seed

    log.info('Loading data')
    df = read_telescope_data(
        signal_path, config,
        model_config.columns_to_read_train,
        feature_generation_config=model_config.feature_generation,
        n_sample=model_config.n_signal)
    log.info('Total number of events: {}'.format(len(df)))

    # Transform the simulated source position into camera coordinates and
    # derive the regression/classification targets (|disp| and sign of delta).
    source_x, source_y = horizontal_to_camera(
        az=df[model_config.source_az_column],
        zd=df[model_config.source_zd_column],
        az_pointing=df[model_config.pointing_az_column],
        zd_pointing=df[model_config.pointing_zd_column],
    )
    df['true_disp'], df['true_sign'] = calc_true_disp(
        source_x, source_y,
        df[model_config.cog_x_column], df[model_config.cog_y_column],
        df[model_config.delta_column],
    )

    # generate features if given in config
    if model_config.feature_generation:
        feature_generation(df, model_config.feature_generation, inplace=True)

    df_train = convert_to_float32(df[config.disp.features])
    df_train.dropna(how='any', inplace=True)
    log.info('Events after nan-dropping: {} '.format(len(df_train)))

    # Keep only targets for rows that survived the NaN drop.
    target_disp = df['true_disp'].loc[df_train.index]
    target_sign = df['true_sign'].loc[df_train.index]

    log.info('Starting {} fold cross validation... '.format(
        model_config.n_cross_validations))
    scores_disp = []
    scores_sign = []
    cv_predictions = []

    kfold = model_selection.KFold(
        n_splits=model_config.n_cross_validations,
        shuffle=True,
        random_state=config.seed,
    )

    for fold, (train, test) in tqdm(enumerate(kfold.split(df_train.values))):
        # extract training and test set for current CV fold
        cv_x_train, cv_x_test = df_train.values[train], df_train.values[test]
        cv_disp_train, cv_disp_test = target_disp.values[
            train], target_disp.values[test]
        cv_sign_train, cv_sign_test = target_sign.values[
            train], target_sign.values[test]

        disp_regressor.fit(cv_x_train, cv_disp_train)
        cv_disp_prediction = disp_regressor.predict(cv_x_test)

        sign_classifier.fit(cv_x_train, cv_sign_train)
        cv_sign_prediction = sign_classifier.predict(cv_x_test)

        scores_disp.append(metrics.r2_score(cv_disp_test, cv_disp_prediction))
        scores_sign.append(
            metrics.accuracy_score(cv_sign_test, cv_sign_prediction))

        cv_predictions.append(
            pd.DataFrame({
                'disp': cv_disp_test,
                'disp_prediction': cv_disp_prediction,
                'sign': cv_sign_test,
                'sign_prediction': cv_sign_prediction,
                'cv_fold': fold
            }))

    predictions_df = pd.concat(cv_predictions, ignore_index=True)
    log.info('writing predictions from cross validation')
    write_data(predictions_df, predictions_path, mode='w')

    scores_disp = np.array(scores_disp)
    scores_sign = np.array(scores_sign)
    log.info('Cross validated R^2 scores for disp: {}'.format(scores_disp))
    log.info('Mean R^2 score from CV: {:0.4f} ± {:0.4f}'.format(
        scores_disp.mean(), scores_disp.std()))
    log.info('Cross validated accuracy for the sign: {}'.format(scores_sign))
    log.info('Mean accuracy from CV: {:0.4f} ± {:0.4f}'.format(
        scores_sign.mean(), scores_sign.std()))

    log.info('Building new model on complete data set...')
    # set random seed again to make sure different settings
    # for n_cross_validations don't change the final model
    np.random.seed(config.seed)
    disp_regressor.random_state = config.seed
    sign_classifier.random_state = config.seed

    disp_regressor.fit(df_train.values, target_disp.values)
    sign_classifier.fit(df_train.values, target_sign.values)

    log.info('Pickling disp model to {} ...'.format(disp_model_path))
    pickle_model(
        disp_regressor,
        feature_names=list(df_train.columns),
        model_path=disp_model_path,
        label_text='disp',
    )
    log.info('Pickling sign model to {} ...'.format(sign_model_path))
    pickle_model(
        sign_classifier,
        feature_names=list(df_train.columns),
        model_path=sign_model_path,
        # Fix: was 'disp' (copy-paste) -- this is the sign classifier.
        label_text='sign',
    )
# Assemble the feature matrix and standardize each column.
X = np.array(feature_list)
X_scaler = StandardScaler().fit(X)
X_train = X_scaler.transform(X)

# Encode the string labels as integers.
y_train = np.array(label_list)
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)

# Linear-kernel SVM, evaluated with a seeded 5-fold split.
clf = svm.SVC(kernel='linear')
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=1)

# Cross-validated accuracy per fold, plus mean +/- 2 std summary.
scores = model_selection.cross_val_score(
    estimator=clf, X=X_train, y=y_train, cv=kf, scoring='accuracy')
print('Scores: ' + str(scores))
print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), 2 * scores.std()))

# Out-of-fold prediction for every training sample.
predictions = model_selection.cross_val_predict(
    estimator=clf, X=X_train, y=y_train, cv=kf)
# irisデータセット iris = datasets.load_iris() # 交差検証の分割数 splits = 5 # 交差検証のスコア score_linier = 0 score_poly = 0 score_rbf = 0 score_kneighbors = 0 score_dtree = 0 score_randomfr = 0 # 訓練データとテストデータに分割 kf = model_selection.KFold(n_splits=splits).split(iris.data) for train_idx, test_idx in kf: train_data = iris.data[train_idx] train_target = iris.target[train_idx] test_data = iris.data[test_idx] test_target = iris.target[test_idx] # モデルを学習 clf1 = svm.LinearSVC(max_iter=10000) clf1.fit(train_data, train_target) clf2 = svm.SVC(kernel='poly', degree=3, gamma='scale') clf2.fit(train_data, train_target) clf3 = svm.SVC(kernel='rbf', gamma='scale') clf3.fit(train_data, train_target) clf4 = neighbors.KNeighborsClassifier(n_neighbors=6) clf4.fit(train_data, train_target)
############################### # blank list to store results print("\n*** Cross Validation Init ***") xvModNames = [] xvAccuracy = [] xvSDScores = [] print("Done ...") # cross validation from sklearn import model_selection print("\n*** Cross Validation ***") # iterate through the lModels for vModelName, oModelObj in lModels: # select xv folds kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=707) # actual corss validation cvAccuracy = cross_val_score(oModelObj, X, y, cv=kfold, scoring='accuracy') # prints result of cross val ... scores count = lfold splits print(vModelName, ": ", cvAccuracy) # update lists for future use xvModNames.append(vModelName) xvAccuracy.append(cvAccuracy.mean()) xvSDScores.append(cvAccuracy.std()) # cross val summary print("\n*** Cross Validation Summary ***") # header msg = "%10s: %10s %8s" % ("Model ", "xvAccuracy", "xvStdDev") print(msg) # for each model
l2_reg=3.535679697949907e-05, learning_rate=0.0008170485394812195, representation_name='acsf', representation_params=acsf_params, tensorboard=True, store_frequency=25, hidden_layer_sizes=(15, 88)) estimator.set_properties(ene_isopent) estimator.generate_representation(xyz_isopent, zs_isopent, method="fortran") # Training the model on 3 folds of n samples for n in n_samples: cv_idx = idx_train[:n] splitter = modsel.KFold(n_splits=3, random_state=42, shuffle=True) indices = splitter.split(cv_idx) scores_per_fold = [] traj_scores_per_fold = [] for item in indices: idx_train_fold = cv_idx[item[0]] idx_test_fold = cv_idx[item[1]] estimator.fit(idx_train_fold) # Scoring the model score = estimator.score(idx_test_fold) traj_score = estimator.score(idx_test) scores_per_fold.append(score)
# Names of attributes/classes extracted from the loaded MATLAB structures.
attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()]
classNames = [name[0] for name in mat_data['classNames'].squeeze()]
N, M = X.shape
C = len(classNames)

# Parameters for neural network classifier
n_hidden_units = 4  # number of hidden units
n_train = 2  # number of networks trained in each k-fold
# These parameters are usually adjusted to: (1) data specifics, (2) computational constraints
learning_goal = 2.0  # stop criterion 1 (train mse to be reached)
max_epochs = 200  # stop criterion 2 (max epochs in training)

# K-fold CrossValidation (4 folds here to speed up this example)
K = 4
CV = model_selection.KFold(K, shuffle=True)

# Variable for classification error
errors = np.zeros(K) * np.nan  # test error per fold (NaN until filled)
error_hist = np.zeros((max_epochs, K)) * np.nan  # per-epoch training history per fold
bestnet = list()  # best network trained in each fold
k = 0
for train_index, test_index in CV.split(X, y):
    print('\nCrossvalidation fold: {0}/{1}'.format(k + 1, K))

    # extract training and test set for current CV fold
    X_train = X[train_index, :]
    y_train = y[train_index, :]
    X_test = X[test_index, :]
    y_test = y[test_index, :]
# train = train[predict_col] # train = scipy.sparse.hstack([scipy.sparse.csr_matrix(train), desc_train, title_train]) test_x = test[predict_col] test_x = scipy.sparse.hstack( [scipy.sparse.csr_matrix(test_x), desc_test, title_test]) test_x = test_x.tocsr() train_y = train["deal_probability"] train_x = train[predict_col] train_x = scipy.sparse.hstack( [scipy.sparse.csr_matrix(train_x), desc_train, title_train]) train_x = train_x.tocsr() timer.time("prepare train in ") split_num = 4 skf = model_selection.KFold(n_splits=split_num, shuffle=False) lgb = pocket_lgb.GoldenLgb() total_score = 0 models = [] for train_index, test_index in skf.split(train): X_train, X_test = train_x[train_index], train_x[test_index] y_train, y_test = train_y.iloc[train_index], train_y.iloc[test_index] model = lgb.do_train_avito(X_train, X_test, y_train, y_test, lgb_col) score = model.best_score["valid_0"]["rmse"] total_score += score models.append(model) lgb.show_feature_importance(models[0]) print("average score= ", total_score / split_num)
raw_X = combineSeqData(Data) # 原始数据 lbp_X = combineLBPSeqData(Data) # 使用LBP进行特征提取后的数据 y = np.array(Data['Face'].values) # pca pca = PCA(n_components=76, svd_solver='auto', whiten=True).fit(raw_X) pca_X = pca.transform(raw_X) # 建立模型(决策树/随机森林) clf0 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=1) clf1 = RandomForestClassifier(max_depth=6) num_folds = 10 scoring = 'accuracy' for name, data in (["RAW", raw_X], ["PCA", pca_X], ["LBP", lbp_X]): kfold = model_selection.KFold(n_splits=num_folds) cv_results0 = model_selection.cross_val_score(clf0, data, y, cv=kfold, scoring=scoring) # 决策树 cv_results1 = model_selection.cross_val_score(clf1, data, y, cv=kfold, scoring=scoring) # 随机森林 msg0 = "%s DT: %f (%f)" % (name, cv_results0.mean(), cv_results0.std()) print(msg0) msg1 = "%s RF: %f (%f)" % (name, cv_results1.mean(), cv_results1.std()) print(msg1)
def mltest(self, freqs):
    """Train several classifiers on the stored tone data and report
    cross-validated accuracy plus a key prediction for the submitted tones.

    :param freqs: sequence of three frequency strings from the request form
    :returns: rendered 'details.html' with per-model results, or
        'index_error.html' when any frequency field is empty
    """
    # Fix: compare with '!=' instead of 'is not' -- identity comparison
    # against a string literal is unreliable (SyntaxWarning on Python 3.8+).
    if freqs[0] != "" and freqs[1] != "" and freqs[2] != "":
        print(freqs)
        q = 'select tone1, tone2, tone3, tone_key from tones'
        df = pd.read_sql(q, self.con)
        df.head()
        df_size = df.size
        df_shape = df.shape
        df_columns = list(df.columns)
        df_targetname = df[df.columns[-1]].name
        df_featurenames = df_columns[
            1:-1]  # select all columns till last column
        df_Xfeatures = df.iloc[:, 1:-1]
        df_Ylabels = df[df.columns[-1]]
        # Model Building
        X = df_Xfeatures
        Y = df_Ylabels
        seed = 528
        # prepare models
        models = []
        models.append(('K-Neighbours', KNeighborsClassifier()))
        models.append(('Classification and Regression Tree', DecisionTreeClassifier()))
        models.append(('Gaussian Naive Bayes', GaussianNB()))
        models.append(('SVC', svm.SVC()))
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            X, Y, test_size=0.1, random_state=42)
        # clf = svm.SVC()
        # clf.fit(X, Y)
        # print(X_test)
        # evaluate each model in turn
        train_size = 500
        results = []
        names = []
        allmodels = []
        scoring = 'accuracy'
        for name, model in models:
            print(name + " Prediction: ")
            model.fit(X, Y)
            # print(model.predict([[1.261802575107296, 1.497854077253219]])) ##Major
            # print(model.predict([[1.1888412017167382, 1.497854077253219]])) ##Minor
            # print(model.predict([[1.6823104693140793, 10.086642599277978]])) ##Minor
            # print(model.predict([[2.3776824034334765, 5.9957081545064375]])) ##Minor
            # print(model.predict([[1.259090909090909, 11.986363636363636]])) ##Major
            # print(model.predict([[1.5884476534296028, 4.759927797833935]])) ##Major
            # print(model.predict([[1.1888412017167382, 2.0]])) ##Minor
            # Ratios of the 2nd/3rd tone to the 1st are the model features.
            user_in_processed = float(freqs[1]) / float(freqs[0]), float(
                freqs[2]) / float(freqs[0])
            predictions = (model.predict([[
                float(freqs[1]) / float(freqs[0]),
                float(freqs[2]) / float(freqs[0])
            ]]))
            predict_text = [i.strip('[]') for i in predictions]
            ratios = float(freqs[0]) / float(freqs[0]), float(
                freqs[1]) / float(freqs[0]), float(freqs[2]) / float(
                    freqs[0])
            # Fix: shuffle=True -- KFold raises ValueError when random_state
            # is set without shuffling (scikit-learn >= 0.24).
            kfold = model_selection.KFold(n_splits=10, shuffle=True,
                                          random_state=seed)
            cv_results = model_selection.cross_val_score(model, X, Y,
                                                         cv=kfold,
                                                         scoring=scoring)
            results.append(cv_results)
            names.append(name)
            msg = "%s: %f (%f) %s %s" % (name, cv_results.mean(),
                                         cv_results.std(), predict_text,
                                         ratios)
            allmodels.append(msg)
        model_results = results
        model_names = names
        # Debug dump of the last model's result and the aggregates.
        print("Name: ")
        print(name)
        print("Message: ")
        print(msg)
        print("Results: ")
        print(cv_results)
        print("Results: ")
        print(*results, sep="\n")
        print("Models: ")
        print(*allmodels, sep="\n")
        print(df_targetname)
        return render_template('details.html',
                               df_size=df_size,
                               df_shape=df_shape,
                               df_columns=df_columns,
                               df_targetname=df_targetname,
                               model_results=allmodels,
                               model_names=names,
                               dfplot=df)
    else:
        return render_template('index_error.html')
    # (tail of the LightGBM parameter dict started above this excerpt)
    "num_leaves" : 30,
    "learning_rate" : 0.01,
    "bagging_fraction" : 0.7,
    "feature_fraction" : 0.7,
    "bagging_frequency" : 5,
    "bagging_seed" : 2018,
    "verbosity" : -1
    }

    # Train with early stopping on the validation set; predict at the best iteration.
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=200, evals_result=evals_result)
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return pred_test_y, model, evals_result

# 5-fold bagging: average the test predictions over folds, then invert the
# log1p target transform before writing the submission.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
pred_test_full = 0
for dev_index, val_index in kf.split(X_train):
    dev_X, val_X = X_train.loc[dev_index,:], X_train.loc[val_index,:]
    dev_y, val_y = y_train[dev_index], y_train[val_index]
    pred_test, model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y, X_test)
    pred_test_full += pred_test
pred_test_full /= 5.
pred_test_full = np.expm1(pred_test_full)
sub_df = pd.DataFrame({"ID":test_df["ID"].values})
sub_df["target"] = pred_test_full
sub_df.to_csv("baseline_lgb_pca.csv", index=False)
import pandas as pd
from sklearn import model_selection

if __name__ == "__main__":
    # Load the training data, add a fold column, and shuffle the rows
    # before fold assignment (KFold itself does not shuffle here).
    df = pd.read_csv("../input/train.csv")
    df["kfold"] = -1
    df = df.sample(frac=1).reset_index(drop=True)

    # Stamp every validation row with its fold index.
    splitter = model_selection.KFold(n_splits=5)
    for fold, (train_idx, val_idx) in enumerate(splitter.split(X=df)):
        print(len(train_idx), len(val_idx))
        df.loc[val_idx, 'kfold'] = fold

    df.to_csv("../input/train_folds.csv", index=False)
min_weight_fraction_leaf=0.0, #max_features=None, # number of features to consider when looking for the best split; None: max_features=n_features max_features="sqrt", max_leaf_nodes=None, # None: unlimited number of leaf nodes bootstrap=True, oob_score=True, # estimate Out-of-Bag Cross Entropy n_jobs=multiprocessing.cpu_count() - 4, # paralellize over all CPU cores minus 4 #class_weight=None, # our classes are skewed, but but too skewed #class_weight={0:0.2,1:0.8}, #class_weight={0:0.4,1:0.6}, class_weight={0:0.6,1:0.4}, random_state=RANDOM_SEED, verbose=0, warm_start=False) kfold = model_selection.KFold(n_splits=5, random_state=RANDOM_SEED) eval_standard = ['accuracy', 'recall_macro', 'precision_macro', 'f1_macro'] results = [] for scoring in eval_standard: cv_results = model_selection.cross_val_score(model, X_train, y_train, scoring=scoring, cv=kfold) results.append(cv_results) msg = "%s: %f (%f)" % (scoring, cv_results.mean(), cv_results.std()) print(msg) # Make predictions on validation dataset test_df = pandas.read_csv(test_fullpath, sep=',', na_values='NA',
import pandas as pd
import numpy as np
import sklearn.linear_model as lm
from sklearn import model_selection
from main import *

# Feature columns and the target column of `data` (loaded via `from main import *`).
feature = [0, 2, 4, 6, 7, 8]
target = [12]
X = data[:, feature]
y = data[:, target]
N, M = X.shape

# K-fold cross-validation over a logarithmic grid of regularisation strengths.
K = 10
result_lambda = np.empty(K)
CV = model_selection.KFold(K, shuffle=True, random_state=1)
f = 0
y = y.squeeze()
fold_size = np.empty(K)
lambda_interval = np.logspace(-20, 10, 100)
train_error_rate = np.zeros(len(lambda_interval))
test_error_rate = np.zeros(len(lambda_interval))

# Per-lambda lists of generalisation / training errors collected across folds.
genErrors = dict()
trainErrors = dict()
for i in range(0, len(lambda_interval)):
    genErrors[i] = []
    trainErrors[i] = []

for train_index, val_index in CV.split(X, y):