def test_column_transformer_sparse_stacking():
    """Check the stacked output of a ColumnTransformer mixing two transformers.

    The transformed result must be sparse, have one more column than it has
    rows, and every column after the first must form an identity matrix.
    """
    data = np.array([[0, 1, 2], [2, 4, 6]]).T
    stacker = ColumnTransformer([
        ('trans1', Trans(), [0]),
        ('trans2', SparseMatrixTrans(), 1),
    ])
    stacker.fit(data)
    stacked = stacker.transform(data)
    n_rows = stacked.shape[0]

    assert_true(sparse.issparse(stacked))
    assert_equal(stacked.shape, (n_rows, n_rows + 1))
    # Skip the first column (output of 'trans1'); the rest comes from 'trans2'.
    assert_array_equal(stacked.toarray()[:, 1:], np.eye(n_rows))
def test_column_transformer_sparse_stacking():
    """Check sparse vs. dense stacking depending on ``sparse_threshold``.

    With ``sparse_threshold=0.8`` the stacked output must be sparse; with
    ``sparse_threshold=0.1`` it must be dense. In both cases the result has
    one more column than rows and the trailing columns form an identity.
    """
    data = np.array([[0, 1, 2], [2, 4, 6]]).T

    def build(threshold):
        # Fresh transformer instances on every call, as in a hand-written test.
        return ColumnTransformer(
            [('trans1', Trans(), [0]), ('trans2', SparseMatrixTrans(), 1)],
            sparse_threshold=threshold)

    # High threshold: the stacked output stays sparse.
    sparse_stacker = build(0.8)
    sparse_stacker.fit(data)
    stacked = sparse_stacker.transform(data)
    n_rows = stacked.shape[0]
    assert sparse.issparse(stacked)
    assert_equal(stacked.shape, (n_rows, n_rows + 1))
    assert_array_equal(stacked.toarray()[:, 1:], np.eye(n_rows))
    assert len(sparse_stacker.transformers_) == 2
    assert sparse_stacker.transformers_[-1][0] != 'remainder'

    # Low threshold: the stacked output is densified.
    dense_stacker = build(0.1)
    dense_stacker.fit(data)
    stacked = dense_stacker.transform(data)
    n_rows = stacked.shape[0]
    assert not sparse.issparse(stacked)
    assert stacked.shape == (n_rows, n_rows + 1)
    assert_array_equal(stacked[:, 1:], np.eye(n_rows))
############################################################################### # We will perform a 10-fold cross-validation and train the neural-network with # the two different strategies previously presented. from sklearn.model_selection import StratifiedKFold skf = StratifiedKFold(n_splits=10) cv_results_imbalanced = [] cv_time_imbalanced = [] cv_results_balanced = [] cv_time_balanced = [] for train_idx, valid_idx in skf.split(X_train, y_train): X_local_train = preprocessor.fit_transform(X_train.iloc[train_idx]) y_local_train = y_train.iloc[train_idx].values.ravel() X_local_test = preprocessor.transform(X_train.iloc[valid_idx]) y_local_test = y_train.iloc[valid_idx].values.ravel() elapsed_time, roc_auc = fit_predict_imbalanced_model( X_local_train, y_local_train, X_local_test, y_local_test) cv_time_imbalanced.append(elapsed_time) cv_results_imbalanced.append(roc_auc) elapsed_time, roc_auc = fit_predict_balanced_model(X_local_train, y_local_train, X_local_test, y_local_test) cv_time_balanced.append(elapsed_time) cv_results_balanced.append(roc_auc) ###############################################################################
def analyze_logistic(X,
                     y,
                     model,
                     scale_columns,
                     analyze_params=False,
                     balance_outcomes=False):
    """
    Function for doing analysis of logistic regression. Plots cumulative gain,
    confusion matrix and grid search of optimal learning rate/epochs in SGD
    with k-fold CV (optional). Performs scaling of all continuous features
    in the data set.

    Inputs:
    - X: design matrix, shape (n, p)
    - y: targets, shape (n,)
    - model: estimator exposing set_eta, set_n_epochs, fit and
      predict(X, probability=...); it is mutated and fitted in place
    - scale_columns: list of indices of which columns to MinMax scale
    - analyze_params: boolean, option to perform grid search of learning rate
      and n_epochs in SGD
    - balance_outcomes: boolean, option to balance training data in case of
      skewed classes

    Returns nothing; results are plotted and printed.
    """

    #split data in train/validate and test
    X_train_val, X_test, y_train_val, y_test = train_test_split(X,
                                                                y,
                                                                test_size=0.1)

    #balance training set such that outcomes are 50/50 in training data
    if balance_outcomes:
        non_default_inds = np.where(y_train_val == 0)[0]
        default_inds = np.where(y_train_val == 1)[0]

        #nothing is removed when class 0 is not the majority class
        remove_size = max(len(non_default_inds) - len(default_inds), 0)
        remove_inds = np.random.choice(non_default_inds,
                                       size=remove_size,
                                       replace=False)

        #the indices refer to rows of the train/validation split, so delete
        #from that split (deleting from the full X/y would drop the wrong
        #rows and leak the held-out test rows back into training)
        X_train_val = np.delete(X_train_val, remove_inds, axis=0)
        y_train_val = np.delete(y_train_val, remove_inds, axis=0)
    #end if

    #scale continuous features
    minmaxscaler = MinMaxScaler(feature_range=(-1, 1))
    scaler = ColumnTransformer(remainder='passthrough',
                               transformers=[('minmaxscaler', minmaxscaler,
                                              scale_columns)])

    #scale only test data at this point (CV scales training/validation)
    scaler.fit(X_train_val)
    X_test = scaler.transform(X_test)

    if analyze_params:

        n_etas = 4
        eta_vals = np.linspace(-1, -4, n_etas)
        n_epoch_vals = np.array([10, 100, 500, 1000])

        max_accuracy = 0
        best_eta = 0
        best_n_epochs = 0

        #one dict per (eta, n_epochs) configuration; collected in a list
        #because DataFrame.append no longer exists in pandas 2.0
        score_rows = []

        #perform grid search of best learning rate
        #and number of epochs with k-fold cross-validation
        for eta in eta_vals:
            model.set_eta(10**eta)

            for epoch in n_epoch_vals:
                model.set_n_epochs(epoch)

                #perform cross validation
                mse, r2, accuracy = CV(X_train_val, y_train_val, model)

                score_rows.append({
                    'log eta': eta,
                    'n_epochs': epoch,
                    'mse': mse,
                    'r2': r2,
                    'accuracy': accuracy
                })

                #check if current configuration is better
                if accuracy > max_accuracy:
                    max_accuracy = accuracy
                    best_eta = eta
                    best_n_epochs = epoch
            #end for epoch
        #end for eta

        error_scores = pd.DataFrame(
            score_rows,
            columns=['log eta', 'n_epochs', 'mse', 'r2', 'accuracy'])

        #set optimal model parameters
        model.set_eta(10**best_eta)
        model.set_n_epochs(best_n_epochs)

        #plot heatmap of grid search
        acc_table = pd.pivot_table(error_scores,
                                   values='accuracy',
                                   index=['log eta'],
                                   columns='n_epochs')
        best_rows, best_cols = np.where(acc_table == max_accuracy)

        plt.figure()
        ax = sns.heatmap(acc_table,
                         annot=True,
                         fmt='.2g',
                         cbar=True,
                         linewidths=1,
                         linecolor='white',
                         cbar_kws={'label': 'Accuracy'})
        #mark the (first) best cell; scalars are used so ties cannot hand
        #Rectangle a multi-element array
        if best_rows.size:
            ax.add_patch(
                Rectangle((best_cols[0], best_rows[0]),
                          1,
                          1,
                          fill=False,
                          edgecolor='red',
                          lw=2))
        ax.set_xlabel('Number of epochs')
        ax.set_ylabel(r'log$_{10}$ of Learning rate')

        #widen the y-limits by half a cell on each side
        #NOTE(review): presumably a workaround for cropped heatmap rows -
        #confirm it is still needed with the installed matplotlib
        bottom, top = ax.get_ylim()
        ax.set_ylim(bottom + 0.5, top - 0.5)
        plt.show()
    #end if

    #scale training data
    X_train_val = scaler.transform(X_train_val)

    #pylearn model
    model.fit(X_train_val, y_train_val)
    pred_train = model.predict(X_train_val)
    pred_test = model.predict(X_test)

    #sklearn model
    clf = linear_model.LogisticRegressionCV()
    clf.fit(X_train_val, y_train_val)
    pred_skl = clf.predict(X_test)

    #get accuracy scores
    accuracy_on_test = accuracy_score(y_test, pred_test)
    accuracy_on_train = accuracy_score(y_train_val, pred_train)
    accuracy_skl = accuracy_score(y_test, pred_skl)

    #predict
    pred_train_prob = model.predict(X_train_val, probability=True)
    pred_test_prob = model.predict(X_test, probability=True)

    #get area ratio and plot cumulative gain
    area_ratio_train = cumulative_gain_area_ratio(y_train_val,
                                                  pred_train_prob,
                                                  title='Training results')
    area_ratio_test = cumulative_gain_area_ratio(y_test,
                                                 pred_test_prob,
                                                 title=None)
    plt.show()

    #plot confusion matrix
    ax1 = plot_confusion_matrix(y_test,
                                pred_test,
                                normalize=True,
                                cmap='Blues',
                                title=' ')
    ax2 = plot_confusion_matrix(y_train_val,
                                pred_train,
                                normalize=True,
                                cmap='Blues',
                                title='Training data')

    #both axes get the limits derived from the test-set plot
    bottom, top = ax1.get_ylim()
    ax1.set_ylim(bottom + 0.5, top - 0.5)
    ax2.set_ylim(bottom + 0.5, top - 0.5)

    plt.show()

    #print some stats
    print('===accuracy and area ratio stats===')
    print('accuracy on test:', accuracy_on_test)
    print('accuracy on train:', accuracy_on_train)
    print('accuracy skl:', accuracy_skl)
    print('area ratio train:', area_ratio_train)
    print('area ratio test:', area_ratio_test)

    if analyze_params:
        print('===grid search stats===')
        print('max accuracy:', max_accuracy)
        print('eta:', best_eta)
        print('n_epochs:', best_n_epochs)
cv=8, scoring='accuracy', n_jobs=-1) results = results.append(pd.Series({ 'Model': type(model).__name__, 'Train Score': model.fit(train_features, train_labels).score(train_features, train_labels), 'Mean Val Score': cv_results.mean() }), ignore_index=True) print(results) best_model = models[results['Mean Val Score'].argmax()] print('Best Model selected:', type(best_model).__name__) best_model.fit(train_features, train_labels) predictions = best_model.predict(train_features) plot_confusion_matrix(best_model, train_features, train_labels) plt.show() print(classification_report(train_labels, predictions)) incorrect_predictions = train[predictions != train_labels] test_features = transformer.transform(test[['text1']]).toarray() predictions = best_model.predict(test_features).astype('int64') output = pd.DataFrame({'id': test.iloc[:, 0], 'target': predictions}) output.to_csv('output.csv', index=False)
# Two named steps: standard-scale the two numeric columns and one-hot encode
# the four listed categorical columns (dense output, sparse=False).
ct = ColumnTransformer([("scaling", StandardScaler(),
                         ['age', 'hours-per-week']),
                        ("onehot", OneHotEncoder(sparse=False),
                         ['workclass', 'education', 'gender', 'occupation'])])

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# get all columns apart from income for the features
data_features = data.drop("income", axis=1)

# split dataframe and income
X_train, X_test, y_train, y_test = train_test_split(data_features,
                                                    data.income,
                                                    random_state=0)

# The transformer is fitted on the training split only and then applied to
# both splits.
ct.fit(X_train)
X_train_trans = ct.transform(X_train)
print(X_train_trans.shape)
# Result:
# (24420, 44)

logreg = LogisticRegression(solver='lbfgs', max_iter=5000)
logreg.fit(X_train_trans, y_train)

X_test_trans = ct.transform(X_test)
print("\nTest score: {:.2f}".format(logreg.score(X_test_trans, y_test)))
# Result:
# Test score: 0.81

# Fitted sub-transformers are reachable by step name.
print("\nct.named_transformers_.onehot:\n", ct.named_transformers_.onehot)
# Result:
# ct.named_transformers_.onehot:
# The problem is solves with the help of Dummy Encoding onehotencoder1 = OneHotEncoder(categorical_features = [4, 5]) X = onehotencoder1.fit_transform(X).toarray() # Splitting the dataset into the Training set and Test set from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0) print(X_train[0]) # Feature Scaling from sklearn.preprocessing import StandardScaler from sklearn.compose import ColumnTransformer ct = ColumnTransformer([('one', StandardScaler(), [17, 18, 19, 20])], remainder='passthrough') X_train = ct.fit_transform(X_train) X_test = ct.transform(X_test) # Fitting Random Forest Classification to the Training set from sklearn.ensemble import RandomForestClassifier classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0) classifier.fit(X_train, y_train) # Predicting the Test set results y_pred = classifier.predict(X_test) # y_fnl = [] # for i in range(0, len(y_pred)): # y_fnl.append(int(y_pred[i])) # print(y_pred[i],y_test[i]) #Making the Confusion Matrix
pred_x = pred_x.drop('Instance', axis=1) # print(pred_x) # create imputer for missing values with different strategies for numerical vs categorical data, # numerical takes the mean, categorical the mode ct = ColumnTransformer(transformers=[('cat_imp', SimpleImputer(strategy='most_frequent'), [1, 3, 5, 6, 7, 8]), ('num_imp', SimpleImputer(strategy='median'), [0, 2, 4, 9])], remainder='passthrough') # split data into train and test sets X_train, X_test, y_train, y_test = train_test_split(training_x, training_y, shuffle=True, test_size=0.2) # apply imputer to data ct.fit(X_train, y_train) pred_x = ct.transform(pred_x) X_train = ct.transform(X_train) X_test = ct.transform(X_test) # create catboost pool data structures specifying categorical features for both train and test data pool_train = Pool(X_train, label=y_train, cat_features=[4, 5, 6, 7, 8, 9]) pool_test = Pool(X_test, label=y_test, cat_features=[4, 5, 6, 7, 8, 9]) print("Starting model creation") # create catboostmodel model = CatBoostRegressor(cat_features=[4, 5, 6, 7, 8, 9], eval_metric='RMSE', od_type='Iter', od_wait=10, one_hot_max_size=40, task_type="GPU", devices='0', use_best_model=True,iterations=10000, learning_rate=0.01, depth=10, l2_leaf_reg=3, random_strength=4, bagging_temperature=10, border_count=255) #fit model to data model.fit(pool_train, eval_set=pool_test, use_best_model=True)
def test_feature_name_validation():
    """Tests if the proper warning/error is raised if the columns do not
    match during fit and transform."""
    pd = pytest.importorskip("pandas")

    array = np.ones(shape=(3, 2))
    wider_array = np.ones(shape=(3, 3))
    frame = pd.DataFrame(array, columns=['a', 'b'])
    wider_frame = pd.DataFrame(wider_array, columns=['a', 'b', 'c'])

    def fit_on_frame(columns, **kwargs):
        # Build a single-step ColumnTransformer and fit it on the 2-column frame.
        transformer = ColumnTransformer([('bycol', Trans(), columns)],
                                        **kwargs)
        transformer.fit(frame)
        return transformer

    mismatch_msg = ("Given feature/column names or counts do not match the ones for "
                    "the data given during fit.")
    negative_msg = "At least one negative column was used to"

    # Columns selected by name; transforming a wider frame warns.
    by_name = fit_on_frame(['a', 'b'])
    with pytest.warns(DeprecationWarning, match=mismatch_msg):
        by_name.transform(wider_frame)

    # Column selected by position; a wider array warns, the original does not.
    by_position = fit_on_frame([0])
    with pytest.warns(DeprecationWarning, match=mismatch_msg):
        by_position.transform(wider_array)
    with warnings.catch_warnings(record=True) as recorded:
        by_position.transform(array)
    assert not recorded

    # Remainder given as a transformer; a wider frame warns.
    with_remainder = fit_on_frame(['a'], remainder=Trans())
    with pytest.warns(DeprecationWarning, match=mismatch_msg):
        with_remainder.transform(wider_frame)

    # Negative positional index; a wider frame is an error.
    negative_index = fit_on_frame([0, -1])
    with pytest.raises(RuntimeError, match=negative_msg):
        negative_index.transform(wider_frame)

    # Negative slice; a wider frame is an error, the original frame is silent.
    negative_slice = fit_on_frame(slice(-1, -3, -1))
    with pytest.raises(RuntimeError, match=negative_msg):
        negative_slice.transform(wider_frame)
    with warnings.catch_warnings(record=True) as recorded:
        negative_slice.transform(frame)
    assert not recorded
def main(train_X, test_X, train_y, test_y, conf1, conf2, roc_path):
    """Train and evaluate classifiers on pre-split CSV data and save reports.

    Reads the four CSV files, carves a validation split off the training data,
    scales/one-hot encodes the features, grid-searches several classifiers,
    fits a decision tree, and writes two confusion matrices and a ROC chart.

    Parameters
    ----------
    train_X, test_X, train_y, test_y : path-like
        CSV files read with ``pd.read_csv(..., index_col=0)``.
    conf1 : path-like
        Output CSV for the validation-set confusion matrix.
    conf2 : path-like
        Output CSV for the test-set confusion matrix.
    roc_path : path-like
        Output file for the ROC chart (passed to ``chart.save``).
    """
    # Seed NumPy's global generator. The previous `np.random.RandomState(414)`
    # only built a throw-away generator object and seeded nothing.
    np.random.seed(414)
    warnings.filterwarnings(action='ignore', category=FitFailedWarning)

    # import the already split datasets
    X_train = pd.read_csv(train_X, index_col=0)
    y_train = pd.read_csv(train_y, index_col=0)
    X_test = pd.read_csv(test_X, index_col=0)
    y_test = pd.read_csv(test_y, index_col=0)

    # Test that X_train has more rows than X_test (warn only, do not abort).
    # A plain `if` is used because `assert` is stripped under `python -O`.
    if X_train.shape[0] <= X_test.shape[0]:
        print(
            "X_train should have more rows than X_test.\nDid you put them in the wrong order?"
        )

    # Make validation set
    X_train, X_valid, y_train, y_valid = train_test_split(X_train,
                                                          y_train,
                                                          test_size=0.2,
                                                          random_state=414)

    numeric_features = ["age", "result"]

    one_hot_features = [
        "gender", "ethnicity", "jaundice", "country_of_res", "used_app_before",
        "age_desc", "relation", "Class/ASD"
    ]

    # The first ten columns are joined back untransformed further down.
    other_columns = list(X_train.columns[0:10])

    # sparse_threshold=0 forces a dense array so it can be wrapped in a DataFrame.
    preprocessor = ColumnTransformer(
        sparse_threshold=0,
        transformers=[("scale", StandardScaler(), numeric_features),
                      ("one_hot",
                       OneHotEncoder(drop=None, handle_unknown="ignore"),
                       one_hot_features)])

    X_train_temp = pd.DataFrame(
        preprocessor.fit_transform(X_train),
        index=X_train.index,
        columns=(numeric_features +
                 list(preprocessor.named_transformers_["one_hot"].
                      get_feature_names(one_hot_features))))

    X_test_temp = pd.DataFrame(preprocessor.transform(X_test),
                               index=X_test.index,
                               columns=X_train_temp.columns)

    X_valid_temp = pd.DataFrame(preprocessor.transform(X_valid),
                                index=X_valid.index,
                                columns=X_train_temp.columns)

    X_train = X_train_temp.join(X_train[other_columns])
    X_test = X_test_temp.join(X_test[other_columns])
    X_valid = X_valid_temp.join(X_valid[other_columns])

    # The label encoder is fitted on the training labels only.
    le = LabelEncoder()
    y_train = le.fit_transform(y_train.to_numpy().ravel())
    y_test = le.transform(y_test.to_numpy().ravel())
    y_valid = le.transform(y_valid.to_numpy().ravel())

    ## Trying Gridsearch on different models to find best
    ## Initialize models
    # lr = LogisticRegression()
    dt = DecisionTreeClassifier(random_state=414)
    rf = RandomForestClassifier(random_state=414)
    svm = SVC(random_state=414)
    knn = KNeighborsClassifier()

    # Make list for models and a list to store their values
    estimators = [dt, rf, svm, knn]
    best_parameters = []
    # Best mean cross-validated score per estimator (the scorer is recall).
    best_scores = []

    # Make list of dictionaries for parameters (same order as `estimators`)
    params = [
        #{'C':[0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
        #'penalty': ['l1', 'l2']},
        {
            'max_depth': [1, 5, 10, 15, 20, 25, None],
            'max_features': [3, 5, 10, 15, 20, 50, None]
        },
        {
            'min_impurity_decrease': [0, 0.25, 0.5],
            'max_features': [3, 5, 10, 20, 50, 'auto']
        },
        {
            'C':
            [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
            'gamma':
            [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
        },
        {
            'n_neighbors': [2, 5, 10, 15, 20, 50, 100],
            'algorithm': ['auto', 'brute']
        }
    ]

    # Run for loop to find the best parameters for each model
    # Scoring = recall, i.e. the search favours models with few false negatives
    for estimator, param_grid in zip(estimators, params):
        search = GridSearchCV(estimator=estimator,
                              param_grid=param_grid,
                              cv=10,
                              n_jobs=-1,
                              scoring='recall')

        search_object = search.fit(X_train, y_train)

        # Store the output on each iteration
        best_parameters.append(search_object.best_params_)
        best_scores.append(search_object.best_score_)

    # NOTE(review): the result of this lookup is discarded; the tree below uses
    # hard-coded hyper-parameters rather than the search output.
    best_parameters[np.argmax(best_scores)]

    # Comments from an earlier run of the search: the best score came from a
    # decision tree classifier with max_depth=15 and max_features=50
    # (score = 0.46)
    dt = DecisionTreeClassifier(max_depth=15, max_features=50)
    dt.fit(X_train, y_train).score(X_train, y_train)
    # It gets almost perfect on the train set

    dt.score(X_valid, y_valid)
    # and ~81% on the validation set

    prelim_matrix = pd.DataFrame(confusion_matrix(y_valid,
                                                  dt.predict(X_valid)))

    preliminary_matrix = prelim_matrix.rename(columns={
        0: "Predicted no autism",
        1: 'Predicted autism'
    },
                                              index={
                                                  0: "Does not have autism",
                                                  1: 'Has autism'
                                              })

    preliminary_matrix.to_csv(conf1)

    #print(classification_report(y_test, dt.predict(X_test)))

    ## Subset just the questions:
    questions = [
        'A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score',
        'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score'
    ]

    questions_train_df = X_train[questions]
    questions_valid_df = X_valid[questions]
    questions_test_df = X_test[questions]

    # Attribution: Varada Kolhatkar
    class ForwardSelection:
        def __init__(self,
                     model,
                     min_features=None,
                     max_features=None,
                     scoring=None,
                     cv=None):
            """ Initialize a Forward selection model """
            self.max_features = max_features
            if min_features is None:
                self.min_features = 1
            else:
                self.min_features = min_features

            self.model = model
            self.scoring = scoring
            self.cv = cv
            # Indices of the selected features, filled in by fit().
            self.ftr_ = []
            return

        def fit(self, X, y):
            """ Fit a forward selection model """

            error = np.inf
            best = None
            feature_index = list(range(0, (X.shape[1])))
            errors = []

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=514)

            X_temp = X_train

            while error > 0.0:
                if best is not None:
                    if best not in feature_index:
                        del feature_index[-2]
                        break
                    feature_index.remove(best)

                # Try each remaining feature on top of the ones already kept.
                for i in feature_index:
                    self.model.fit(X_temp[:, self.ftr_ + [i]], y_train)
                    temp_error = 1 - np.mean(
                        cross_val_score(
                            self.model, X[:, self.ftr_ + [i]], y,
                            scoring='f1'))

                    if temp_error < error:
                        error = temp_error
                        best = i

                errors.append(round(error, 3))

                # Stop as soon as the rounded error no longer improves.
                if len(errors) > 2:
                    if errors[-1] >= errors[-2]:
                        break

                if self.max_features is not None:
                    if len(errors) > self.max_features:
                        break

                self.ftr_.append(best)

        def transform(self, X, y=None):
            """ Transform a test set """
            return X[:, self.ftr_]

    fs = ForwardSelection(DecisionTreeClassifier(), max_features=None)
    fs.fit(questions_train_df.to_numpy(), y_train)

    fs.ftr_
    # No single one question is better than any other one question so forward selection won't work
    # Or it just won't work with a decision tree

    rfe = RFE(DecisionTreeClassifier(), n_features_to_select=5)
    rfe.fit(questions_train_df, y_train)

    # The top 5 questions:
    top_five = np.where(rfe.ranking_ == 1)[0]

    X_train_best_5 = questions_train_df.to_numpy()[:, top_five]
    X_test_best_5 = questions_test_df.to_numpy()[:, top_five]
    X_valid_best_5 = questions_valid_df.to_numpy()[:, top_five]

    dt2 = DecisionTreeClassifier()
    dt2.fit(X_train_best_5, y_train)
    pd.DataFrame(confusion_matrix(y_valid, dt2.predict(X_valid_best_5)))
    # Using just the top 5 questions gets a much worse result than using all the features

    # Try all questions:
    dt3 = DecisionTreeClassifier()
    dt3.fit(questions_train_df, y_train)

    # The final report uses `dt`, the tree trained on all features.
    conf_matrix = pd.DataFrame(confusion_matrix(y_test, dt.predict(X_test)))

    final_matrix = conf_matrix.rename(columns={
        0: "Predicted no autism",
        1: 'Predicted autism'
    },
                                      index={
                                          0: "Does not have autism",
                                          1: 'Has autism'
                                      })

    final_matrix.to_csv(conf2)

    # ROC curve
    fpr, tpr, _ = roc_curve(y_test, dt.predict_proba(X_test)[:, 1])
    roc_df = pd.DataFrame({"fpr": fpr, "tpr": tpr})

    # Diagonal reference line from (0, 0) to (1, 1).
    line_df = pd.DataFrame({"start": [0, 1], "end": [0, 1]})

    roc = alt.Chart(roc_df).mark_line().encode(x=alt.X("fpr:Q"),
                                               y=alt.Y("tpr:Q"))

    line = alt.Chart(line_df).mark_line(
        strokeDash=[5, 5], color="orange").encode(
            x=alt.X("start:Q", axis=alt.Axis(title="False Positive Rate")),
            y=alt.Y("end:Q", axis=alt.Axis(title="True Positive Rate")))

    chart = (roc + line).configure_axis(titleFontSize=20).properties(
        title="ROC Curve").configure_title(fontSize=20)
    chart

    chart.save(roc_path)
# One-hot encode the two listed columns; every other column is passed through.
columns_to_transform = ['second_edu_speci', 'degree_title']
transformer = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), columns_to_transform)], remainder='passthrough')

# Features are everything except 'placement', which is the target.
X = transformer.fit_transform(dataset.drop(['placement'], axis=1))
y = dataset['placement']

#Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

#Scaling the features
# Positional indices refer to the already-encoded matrix; the scaler is fitted
# on the training split and reused for the test split.
scaler = ColumnTransformer(transformers=[('scaler', StandardScaler(), [7, 9, 11, 13, 15])], remainder='passthrough')
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#Evaluation metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

#Implementing different models
#Naive Bayes - Mean score = 0.80% - Report score = 81%
from sklearn.naive_bayes import GaussianNB
naive = GaussianNB()
# 10-fold cross-validation on the training split; the mean score is printed.
naive_score = cross_val_score(naive, X_train, y_train, cv=10)
print(np.mean(naive_score))
naive_sin = []
naive_bin = []

# One Gaussian naive Bayes model per feature-set variant (con / sin / bin).
nb_con = GaussianNB()
nb_sin = GaussianNB()
nb_bin = GaussianNB()

# Each value in `semillas` is used as the random_state of the splits, so the
# three feature-set variants are split with the same seed in every iteration.
for i in semillas:
    X_train_con, X_test_con, Y_train_con, Y_test_con = train_test_split(
        X, Y, random_state=i)
    X_train_sin, X_test_sin, Y_train_sin, Y_test_sin = train_test_split(
        X_sin, Y, random_state=i)
    X_train_bin, X_test_bin, Y_train_bin, Y_test_bin = train_test_split(
        X_dif, Y, random_state=i)

    # `trans` is re-fitted for each variant; each test split is transformed
    # right after the matching fit, before the next re-fit overwrites it.
    X_train_con_trans = trans.fit_transform(X_train_con)
    X_test_con_trans = trans.transform(X_test_con)
    X_train_sin_trans = trans.fit_transform(X_train_sin)
    X_test_sin_trans = trans.transform(X_test_sin)
    X_train_bin_trans = trans.fit_transform(X_train_bin)
    X_test_bin_trans = trans.transform(X_test_bin)

    # Fit each regression model on its variant and record the test accuracy.
    regre_con.fit(X_train_con_trans, Y_train_con)
    regresion_con.append(
        accuracy_score(regre_con.predict(X_test_con_trans), Y_test_con))
    regre_sin.fit(X_train_sin_trans, Y_train_sin)
    regresion_sin.append(
        accuracy_score(regre_sin.predict(X_test_sin_trans), Y_test_sin))
    regre_bin.fit(X_train_bin_trans, Y_train_bin)
    regresion_bin.append(
        accuracy_score(regre_bin.predict(X_test_bin_trans), Y_test_bin))
# Rows of the test set whose last column is none of 'S', 'Q' or 'C' get the
# most frequent value instead.
filling_indices = [
    x for x in range(len(X_test))
    if X_test[x, -1] != 'S' and X_test[x, -1] != 'Q' and X_test[x, -1] != 'C'
]
X_test[filling_indices, -1] = most_frequent_embarked

# Label-encode the last column; the encoder is fitted on the training set only.
embarked_encoder = LabelEncoder()
X_train[:, -1] = embarked_encoder.fit_transform(X_train[:, -1])
X_test[:, -1] = embarked_encoder.transform(X_test[:, -1])

# one hot encoding pclass
ct_pclass = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [0])],
    remainder='passthrough')
X_train = ct_pclass.fit_transform(X_train)
X_test = ct_pclass.transform(X_test)

# skipping dummy variable trap
# (drops the first encoded column)
X_train = X_train[:, 1:]
X_test = X_test[:, 1:]

# one hot encoding embarked
# NOTE(review): index 7 is relative to the matrix produced by the step above,
# not to the original column order - verify it still points at embarked.
ct_embarked = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [7])],
    remainder='passthrough')
X_train = ct_embarked.fit_transform(X_train)
X_test = ct_embarked.transform(X_test)

# skipping dummy variable trap
X_train = X_train[:, 1:]
X_test = X_test[:, 1:]

from sklearn.preprocessing import StandardScaler
def main(input1, input2, output):
    """Compare classifiers on the wrangled data and write evaluation reports.

    Parameters
    ----------
    input1 : str
        File name of the training CSV inside ``./data/``.
    input2 : str
        File name of the test CSV inside ``./data/``.
    output : str
        Directory (relative to ``./``) that receives ``model_compare``,
        ``accuracy_report``, ``classification_report`` and ``roc.png``.
    """
    # Read wrangled csv files
    df_train = pd.read_csv(f"./data/{input1}")
    df_test = pd.read_csv(f"./data/{input2}")

    # `axis` is passed by keyword: the positional form is rejected by pandas 2.
    X_train = df_train.drop(['Approved'], axis=1)
    y_train = df_train[['Approved']]

    X_test = df_test.drop(['Approved'], axis=1)
    y_test = df_test[['Approved']]

    ## Encoding categorical variables
    categorical_features = [
        'Sex', 'Ethnicity', 'Married', 'BankCustomer', 'EducationLevel',
        'PriorDefault', 'Employed', 'DriversLicense', 'Citizen', 'ZipCode'
    ]

    # Only the categorical columns are listed in the transformer. The encoder
    # is fitted on the training data and reused for the test data.
    preprocessor = ColumnTransformer(
        transformers=[('ohe',
                       OneHotEncoder(handle_unknown='ignore', sparse=False),
                       categorical_features)])

    X_train = pd.DataFrame(preprocessor.fit_transform(X_train))
    X_test = pd.DataFrame(preprocessor.transform(X_test))

    # Flatten the single-column label frames to 1-D arrays.
    y_train = y_train.to_numpy().ravel()
    y_test = y_test.to_numpy().ravel()

    #empty dictionary to store results
    results_dict = {}

    models = {
        'random forest': RandomForestClassifier(),
        'xgboost': XGBClassifier(),
        'lgbm': LGBMClassifier()
    }

    # Fit each candidate and record [train score, test score, seconds taken].
    for model_name, model in models.items():
        t = time.time()
        #print(model_name, ":")
        clf = Pipeline(steps=[('classifier', model)])
        clf.fit(X_train, y_train)
        train_score, test_score = get_scores(clf,
                                             X_train,
                                             y_train,
                                             X_test,
                                             y_test,
                                             show=False)
        elapsed_time = time.time() - t
        results_dict[model_name] = [
            round(train_score, 3),
            round(test_score, 3),
            round(elapsed_time, 4)
        ]

    model_compare_dataframe = pd.DataFrame(results_dict)
    model_compare_dataframe.to_csv(f'./{output}/model_compare')

    ### Hyper parameter optimisation for Random Forest
    hyper_parameters = [{
        'n_estimators': [3, 5, 10, 50, 100],
        'criterion': ['gini', 'entropy'],
        'max_depth': [10, 20, 50, None]
    }]

    clf = GridSearchCV(RandomForestClassifier(),
                       hyper_parameters,
                       cv=StratifiedKFold(n_splits=3,
                                          shuffle=True,
                                          random_state=23),
                       verbose=0)

    # `fit` returns the search object itself, which is used for prediction.
    best_model = clf.fit(X_train, y_train)

    # Measure accuracies
    train_predictions = best_model.predict(X_train)
    train_accuracy = accuracy_score(y_train, train_predictions)

    test_predictions = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_predictions)
    test_recall = recall_score(y_test, test_predictions)
    test_precision = precision_score(y_test, test_predictions)
    auc_score = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])

    accuracies_df = pd.DataFrame(index=[
        'test accuracy', 'train accuracy', 'test recall', 'test precision',
        'auc score'
    ],
                                 data={
                                     'result': [
                                         test_accuracy, train_accuracy,
                                         test_recall, test_precision, auc_score
                                     ]
                                 })
    accuracies_df.to_csv(f'./{output}/accuracy_report')

    # plot and report confusion matrix
    # NOTE(review): the confusion-matrix plot itself is never written to disk.
    plot_confusion_matrix(best_model, X_test, y_test)
    report = classification_report(y_test, test_predictions, output_dict=True)
    report_df = pd.DataFrame(report)
    report_df.to_csv(f'./{output}/classification_report')

    # compute and save roc curve
    fpr, tpr, thresholds = roc_curve(y_test,
                                     best_model.predict_proba(X_test)[:, 1])
    # Start a fresh figure so the ROC curve is not drawn on top of the
    # confusion-matrix figure that is still current at this point.
    plt.figure()
    plt.plot(fpr, tpr)
    plt.title('ROC report')
    plt.plot((0, 1), (0, 1), '--k')
    plt.xlabel('false positive rate')
    plt.ylabel('true positive rate')
    plt.savefig(f'./{output}/roc.png')
def main():
    """Run the housing-price workflow end to end.

    Fetches and loads the data, makes a stratified train/test split, builds
    the preprocessing pipeline, compares linear, decision-tree and
    random-forest regressors with cross-validation, grid-searches the forest,
    evaluates the best estimator on the test set and finally demonstrates a
    pipeline restricted to the five most important features. All results are
    printed; nothing is returned.
    """
    # Get and load data
    get_data()
    housing = load_data()
    # display_data(housing)

    # Perform and split by strata
    strat_train_set, strat_test_set = do_stratified_sampling(housing)

    # Using the training set, play with the data
    # play_with_data(strat_train_set.copy())

    # Split data into predictors and labels
    housing = strat_train_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set["median_house_value"].copy()

    # Use an imputer to fill in missing values
    # We will fill in these values with the median
    imputer = SimpleImputer(strategy="median")

    # Get dataframe of only numerical vals
    housing_num = housing.drop("ocean_proximity", axis=1)

    # Let the imputer estimate based on the numerical housing vals
    imputer.fit(housing_num)
    # NOTE: The median of each attribute is stored in imputer.statistics_

    # Use trained imputer to fill in gaps by transforming the data
    X = imputer.transform(housing_num)

    # Insert np array into pandas DataFrame
    housing_tr = pd.DataFrame(X,
                              columns=housing_num.columns,
                              index=housing_num.index)

    # Convert categorical attribute to numerical attribute
    housing_cat = housing[["ocean_proximity"]]

    # Use one-hot encoding instead of ordinal encoding
    # as the categories are not ordered.
    cat_encoder = OneHotEncoder()

    # NOTE: This gives a scipy array which stores the location
    # of the "hot" encoding (instead of potentially storing
    # many many "cold" encodings (0's))
    # NOTE: Categories are stored in cat_encoder.categories_
    housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

    # Adding combinational attributes
    attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
    housing_extra_attribs = attr_adder.transform(housing.values)

    # Pipeline for transformations on numerical values
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
    housing_num_tr = num_pipeline.fit_transform(housing_num)

    # It is also possible to perform all of the above transformations
    # in one go
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

    # This is the final set of training data
    housing_prepared = full_pipeline.fit_transform(housing)

    # Fit the linear regression model on prepared data
    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)

    # Do some testing
    some_data = housing.iloc[:5]
    some_labels = housing_labels.iloc[:5]
    some_data_prepared = full_pipeline.transform(some_data)
    print("Predictions:", lin_reg.predict(some_data_prepared))
    print("Labels:", list(some_labels))

    # Get metrics
    housing_predictions = lin_reg.predict(housing_prepared)
    lin_mse = mean_squared_error(housing_labels, housing_predictions)
    lin_rmse = np.sqrt(lin_mse)
    print(lin_rmse)

    # Due to the above results being unsatisfactory
    # Try a decision tree regressor
    tree_reg = DecisionTreeRegressor()
    tree_reg.fit(housing_prepared, housing_labels)

    # Now do some testing on the tree regression model
    housing_predictions = tree_reg.predict(housing_prepared)
    tree_mse = mean_squared_error(housing_labels, housing_predictions)
    tree_rmse = np.sqrt(tree_mse)
    print(tree_rmse)

    # The above testing gives no error
    # Cross validation is performed on 10 folds (training and validating
    # 10 times, choosing a different fold for validation each time
    # and training on the remaining fold)
    scores = cross_val_score(tree_reg,
                             housing_prepared,
                             housing_labels,
                             scoring="neg_mean_squared_error",
                             cv=10)

    # As cross validation expect to use a utility function instead of a
    # cost function (whereas we want to use a cost function), we must
    # flip the sign of the scores.
    tree_rmse_scores = np.sqrt(-scores)

    # Double check against cross validation on the linear reg. model
    lin_scores = cross_val_score(lin_reg,
                                 housing_prepared,
                                 housing_labels,
                                 scoring="neg_mean_squared_error",
                                 cv=10)
    lin_rmse_scores = np.sqrt(-lin_scores)

    print("TREE RSME SCORES")
    display_scores(tree_rmse_scores)

    print("LINEAR REG RMSE SCORES")
    display_scores(lin_rmse_scores)

    # This shows that the Decision Tree is overfitting
    # Therefore we try the Random Forest Regressor
    forest_reg = RandomForestRegressor()
    forest_reg.fit(housing_prepared, housing_labels)
    forest_scores = cross_val_score(forest_reg,
                                    housing_prepared,
                                    housing_labels,
                                    scoring="neg_mean_squared_error",
                                    cv=10)
    forest_rmse_scores = np.sqrt(-forest_scores)

    print("RANDOM FOREST REG RMSE SCORES")
    display_scores(forest_rmse_scores)

    # Fine-tuning by automatically searching for hyperparams
    # Grid indicates to try firstly all permutations of the first dict
    # followed by the permutations of options in the second dict.
    param_grid = [
        {
            "n_estimators": [3, 10, 30],
            "max_features": [2, 4, 6, 8]
        },
        {
            "bootstrap": [False],
            "n_estimators": [3, 10],
            "max_features": [2, 3, 4]
        },
    ]

    forest_reg = RandomForestRegressor()

    # We use five-fold cross validation
    grid_search = GridSearchCV(forest_reg,
                               param_grid,
                               cv=5,
                               scoring="neg_mean_squared_error",
                               return_train_score=True)
    grid_search.fit(housing_prepared, housing_labels)

    # The best parameters are found using:
    print(f"Best hyperparams: {grid_search.best_params_}")

    # The best estimator:
    print(f"Best Estimator: {grid_search.best_estimator_}")

    # The evaluation scores:
    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), params)

    # Examine the relative importance of each attribute for accurate predictions
    feature_importances = grid_search.best_estimator_.feature_importances_

    # Displaying the importance scores next to their attribute names
    extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
    cat_encoder = full_pipeline.named_transformers_["cat"]
    cat_one_hot_attribs = list(cat_encoder.categories_[0])
    attributes = num_attribs + extra_attribs + cat_one_hot_attribs
    print(sorted(zip(feature_importances, attributes), reverse=True))
    # NOTE: The above may indicate which features may be dropped

    # Evaluation on test set
    # Select the best estimator found by the grid search as the final model
    final_model = grid_search.best_estimator_

    # Separate test set into predictors and labels
    X_test = strat_test_set.drop("median_house_value", axis=1)
    y_test = strat_test_set["median_house_value"].copy()

    # NOTE: Only transform test data, DO NOT FIT the model on test data
    X_test_prepared = full_pipeline.transform(X_test)
    final_predictions = final_model.predict(X_test_prepared)

    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)

    # Compute 95% confidence interval
    # (t-interval on the mean squared error, square-rooted to RMSE units)
    confidence = 0.95
    squared_errors = (final_predictions - y_test)**2
    rmse_interval = np.sqrt(
        stats.t.interval(confidence,
                         len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))

    # Report the test-set results; previously both values were computed and
    # then silently discarded.
    print("Final RMSE:", final_rmse)
    print(f"{confidence:.0%} confidence interval for the RMSE:",
          rmse_interval)

    # The following is inserted into our SelectImportantFeatures'
    # fit method, however we add it here for testing later.
    top_k_feature_indices = top_importances(feature_importances, 5)

    # New pipeline, now reducing the data's features to be
    # restricted to the top 5 most important features
    prep_and_feature_pipeline = Pipeline([
        ("prep", full_pipeline),
        ("feature", SelectImportantFeatures(feature_importances, 5))
    ])

    trimmed_housing = prep_and_feature_pipeline.fit_transform(housing)

    # NOTE: If we were to do trimmed_housing[0:3] and
    # housing_prepared[0:3, top_k_feature_indices],
    # the output would be the same.
    print(trimmed_housing[0:3])
    print(housing_prepared[0:3, top_k_feature_indices])
# The categorical feature identifiers are the keys of category_map.
categorical_features = list(category_map.keys())
# Categorical branch: median-impute, then one-hot encode (unknown categories
# seen at transform time are ignored rather than raising).
categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(
        strategy='median')), ('onehot',
                              OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[(
    'num', ordinal_transformer,
    ordinal_features), ('cat', categorical_transformer,
                        categorical_features)])

# train an RF model
print("Train random forest model")
np.random.seed(0)
clf = RandomForestClassifier(n_estimators=50)
pipeline = Pipeline([('preprocessor', preprocessor), ('clf', clf)])
pipeline.fit(X_train, Y_train)

print("Creating an explainer")
# The predictor calls the `preprocessor` and `clf` objects directly instead of
# going through `pipeline`.
explainer = alibi.explainers.AnchorTabular(
    predictor=lambda x: clf.predict(preprocessor.transform(x)),
    feature_names=feature_names,
    categorical_names=category_map)
explainer.fit(X_train)
explainer.predict_fn = None  # Clear explainer predict_fn as it's a lambda and will be reset when loaded

print("Saving individual files")
# The explainer is serialised with dill, the fitted pipeline with joblib.
with open("explainer.dill", 'wb') as f:
    dill.dump(explainer, f)
joblib.dump(pipeline, 'model.joblib')
class RepeatingBasisFunction(TransformerMixin, BaseEstimator):
    """
    This is a transformer for features with some form of circularity.
    E.g. for days of the week you might face the problem that, conceptually, day 7 is as
    close to day 6 as it is to day 1. While numerically their distance is different.
    This transformer remedies that problem.
    The transformer selects a column and transforms it with a given number of repeating (radial) basis functions.
    Radial basis functions are bell-curve shaped functions which take the original data as input.
    The basis functions are equally spaced over the input range.
    The key feature of repeating basis functions is that they are continuous when moving
    from the max to the min of the input range. As a result these repeating basis functions
    can capture how close each datapoint is to the center of each repeating basis function,
    even when the input data has a circular nature.

    :type column: int or list, default=0
    :param column: Indexes the data on its second axis. Integers are interpreted as
        positional columns, while strings can reference DataFrame columns by name.

    :type remainder: {'drop', 'passthrough'}, default="drop"
    :param remainder: By default, only the specified column is transformed, and the
        non-specified columns are dropped. (default of ``'drop'``). By specifying
        ``remainder='passthrough'``, all remaining columns will be automatically passed
        through. This subset of columns is concatenated with the output of the transformer.

    :type n_periods: int, default=12
    :param n_periods: number of basis functions to create, i.e., the number of columns that
        will exit the transformer.

    :type input_range: tuple or None, default=None
    :param input_range: the values at which the data repeats itself. For example, for days of
        the week this is (1,7). If input_range=None it is inferred from the training data.
    """

    def __init__(self, column=0, remainder="drop", n_periods=12, input_range=None):
        self.column = column
        self.remainder = remainder
        self.n_periods = n_periods
        self.input_range = input_range
        # Stays None until fit() builds the underlying ColumnTransformer.
        self.pipeline = None

    def fit(self, X, y=None):
        """Build and fit the internal ColumnTransformer on ``X``; returns ``self``."""
        self.pipeline = ColumnTransformer(
            [(
                "repeatingbasis",
                _RepeatingBasisFunction(n_periods=self.n_periods, input_range=self.input_range),
                [self.column],
            )],
            remainder=self.remainder,
        )
        self.pipeline.fit(X, y)
        return self

    def transform(self, X):
        """Transform ``X`` with the fitted internal ColumnTransformer.

        :raises sklearn.exceptions.NotFittedError: if ``fit`` has not been called.
        """
        from sklearn.exceptions import NotFittedError

        check_is_fitted(self, ["pipeline"])
        # ``pipeline`` exists (as None) right after __init__, so the attribute
        # check above cannot detect an unfitted instance on its own.
        if self.pipeline is None:
            raise NotFittedError(
                "This %s instance is not fitted yet. Call 'fit' with appropriate "
                "arguments before using this estimator." % type(self).__name__
            )
        return self.pipeline.transform(X)
# Cross-validation scores are negated MSE values; flip the sign and take the
# square root to report RMSE for each model on the training set.
svm_rmse_scores = np.sqrt(-svm_scores)
print("\nSVM Regression scores (train set): \n")
display_scores(svm_rmse_scores)

tree_rmse_scores = np.sqrt(-tree_scores)
print("\nDT Regression scores (train set): \n")
display_scores(tree_rmse_scores)

forest_rmse_scores = np.sqrt(-forest_scores)
print("\nRF Regression scores (train set): \n")
display_scores(forest_rmse_scores)

#2-5
# Separate the held-out set into predictors and target, then apply the
# preparation pipeline (transform only).
X_test = strat_test_set.drop(columns="burned_area")
y_test = strat_test_set["burned_area"].copy()
X_test_prepared = full_pipeline.transform(X_test)

# 10-fold cross-validation of the SGD regressor on the prepared test data.
sgd_scores = cross_val_score(
    sgd_reg,
    X_test_prepared,
    y_test,
    scoring="neg_mean_squared_error",
    cv=10,
)
sgd_rmse_scores = np.sqrt(-sgd_scores)
print("\nSGD Regression scores (test set): \n")
display_scores(sgd_rmse_scores)

# Same evaluation for the SVM regressor.
svm_scores = cross_val_score(
    svm_reg,
    X_test_prepared,
    y_test,
    scoring="neg_mean_squared_error",
    cv=10,
)
# Encoding for the Gender Column
# The encoder is fitted on the training data only and reused for the others.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X_train[:, 1] = le.fit_transform(X_train[:, 1])
X_test[:, 1] = le.transform(X_test[:, 1])
fam_test[:, 1] = le.transform(fam_test[:, 1])

# Encoding X categorical data + HotEncoding
# One-hot encode the last column; every other column is passed through.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('encoder', OneHotEncoder(), [-1])], remainder='passthrough')
# ``np.float`` was only an alias of the built-in ``float`` and no longer
# exists in NumPy >= 1.24, so the built-in is used directly.
X_train = np.array(ct.fit_transform(X_train), dtype=float)
X_test = np.array(ct.transform(X_test), dtype=float)
fam_test = np.array(ct.transform(fam_test), dtype=float)

# Avoiding Dummy Variable Trap
# Drop the first encoded column from each array.
X_train = X_train[:, 1:]
X_test = X_test[:, 1:]
fam_test = fam_test[:, 1:]

# Feature Scaling
# Scaling statistics come from the training data only.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
fam_test = sc_X.transform(fam_test)
def make_features(transformer: ColumnTransformer, df: pd.DataFrame) -> pd.DataFrame:
    """Run ``transformer.transform`` on ``df`` and wrap the result in a DataFrame.

    Only ``transform`` is called here; the transformer is not fitted by this
    function. The returned frame is built from the raw transform output, so it
    carries pandas' default index and column labels.
    """
    transformed = transformer.transform(df)
    return pd.DataFrame(transformed)
def _preprocessor(self, x, y=None, training=False):
    """
    Preprocess input of the network.

    Arguments:
        - x {pd.DataFrame} -- Raw input array of shape (batch_size, input_size).
        - y {pd.DataFrame} -- Raw target array of shape (batch_size, 1).
        - training {boolean} -- Boolean indicating if we are training or
            testing the model. When True the transformers are fitted and
            written to ``x_transformer.pkl`` / ``y_transformer.pkl`` in the
            working directory; otherwise they are read back from those files.

    Returns:
        - {torch.tensor} -- Preprocessed input array of size
            (batch_size, input_size).
        - {torch.tensor} -- Target array of size (batch_size, 1), or None
            when ``y`` is not a DataFrame.
    """

    #######################################################################
    #                       ** START OF YOUR CODE **
    #######################################################################

    # --------------------------------------------------------------------
    # SORT THE DATA
    column_names = [
        'longitude', 'latitude', 'housing_median_age', 'total_rooms',
        'total_bedrooms', 'population', 'households', 'median_income',
        'ocean_proximity'
    ]

    # numerical features
    numeric_features = [
        'longitude', 'latitude', 'housing_median_age', 'total_rooms',
        'total_bedrooms', 'population', 'households', 'median_income'
    ]

    # One-hot columns that are kept. 'ocean_proximity_NEAR OCEAN' is left out
    # on purpose to avoid multicollinearity.
    dummy_features = [
        'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
        'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY'
    ]

    # Keep only the expected columns, in a fixed order.
    features = x[column_names]

    # --------------------------------------------------------------------
    # HANDLE CATEGORICAL FEATURES

    # Get dummies to transform categorical to numerical
    features = pd.get_dummies(features)

    # A batch may not contain every category; add each missing indicator
    # column as all zeros so the selection below never fails. Every column is
    # checked independently.
    for dummy_name in dummy_features:
        if dummy_name not in features.columns:
            features[dummy_name] = 0

    features = features[numeric_features + dummy_features]

    outputs = y

    # --------------------------------------------------------------------
    # PRE PROCESSING
    if training:
        # Impute missing values with the median and rescale
        numeric_transformer = Pipeline(
            steps=[('imputer', SimpleImputer(
                strategy='median')), ('scaler', StandardScaler())])

        # Transform the numeric columns, pass the indicator columns through
        ct = ColumnTransformer(transformers=[
            ('num', numeric_transformer, numeric_features),
        ], remainder='passthrough')

        df_processed = ct.fit_transform(X=features)

        # ----------------------------------------------------------------
        # SAVE MODEL
        # Persist the fitted transformer; the file is closed deterministically.
        with open("x_transformer.pkl", "wb") as handle:
            dump(ct, handle)

        # Transform y -> is probably not necessary
        if y is not None:
            y_scaler = MinMaxScaler()
            outputs = y_scaler.fit_transform(outputs)
            with open("y_transformer.pkl", "wb") as handle:
                dump(y_scaler, handle)

    # If we've seen data before, transform with the saved preprocessors
    else:
        with open('x_transformer.pkl', 'rb') as handle:
            ct = load(handle)
        df_processed = ct.transform(features)

        if y is not None:
            with open('y_transformer.pkl', 'rb') as handle:
                y_scaler = load(handle)
            outputs = y_scaler.transform(outputs)

    # --------------------------------------------------------------------
    # RETURN AS TENSORS
    x_tensor = torch.tensor(df_processed, dtype=torch.float32)

    # NOTE(review): the target tensor is built from the raw ``y.values``; the
    # scaled ``outputs`` computed above is not returned. Left unchanged so
    # callers keep receiving unscaled targets -- confirm this is intended.
    if y is not None:
        y_tensor = torch.tensor(y.values, dtype=torch.float32)

    return x_tensor, (y_tensor if isinstance(y, pd.DataFrame) else None)
class kickstarter_predictor():
    """Random-forest classifier for Kickstarter campaign outcomes.

    Bundles CSV loading, JSON column expansion, row/column filtering, feature
    engineering, a one-hot ``ColumnTransformer`` and the model itself.

    NOTE(review): ``dump_model`` persists only ``self.model``; the fitted
    ``self.preprocessor`` is not saved, so predicting with a model loaded via
    ``model_load`` still relies on a preprocessor fitted in the same session.
    """

    def __init__(self) -> None:
        self._RSEED=42
        # Columns holding JSON strings that get expanded into flat columns.
        self._json_cols=['category', 'location']
        self._cat_features_impute = ['country', 'currency', 'category_name', 'location_type']
        self._cat_features_onehot = ['country', 'currency', 'category_name', 'location_type']

        # Columns not listed here are dropped (ColumnTransformer default);
        # sparse_threshold=0 forces a dense result.
        self.preprocessor = ColumnTransformer(
            transformers=[
                #('cat_impute', SimpleImputer(strategy='constant', fill_value='missing'), self._cat_features_impute),
                ('cat_onehot', OneHotEncoder(handle_unknown='ignore'), self._cat_features_onehot),
                ('untouched', 'passthrough', ['duration','goal_usd', 'launched_at_month', 'created_at_month'])
                #('untouched', 'passthrough', ['deadline','static_usd_rate', 'goal', 'launched_at', 'created_at'])
            ],
            sparse_threshold=0
        )

        self.model = RandomForestClassifier(n_estimators=120, random_state=self._RSEED, max_features = 'sqrt', n_jobs=-1, verbose = 1)

        try:
            mkdir('./output')
        except FileExistsError:
            # The directory is already there; that is not a failure.
            pass
        except OSError:
            print ("Creation of the directory output failed.")

    def expand_json_cols(self, df):
        """
        Expand columns that contain json objects

        Values that cannot be parsed (invalid JSON or non-string values such
        as missing entries) are treated as empty objects.

        Parameters
        ---------
        df: Pandas DataFrame

        Returns
        --------
        df: Pandas DataFrame
        """
        df_dicts = pd.DataFrame()
        print('---------- Parsing json ------------')
        for col in self._json_cols:
            print('Parsing json: '+col)
            c = []
            for val in df[col]:
                try:
                    c.append(json.loads(val))
                except (TypeError, ValueError):
                    # TypeError: value is not a string; ValueError: bad JSON
                    # (json.JSONDecodeError is a ValueError subclass).
                    c.append(dict())
            df_dicts[col] = pd.Series(np.array(c))

        print('---------- Expanding dictionaries --------')
        df_expanded = []
        for col in df_dicts.columns:
            print('Expanding: '+col)
            df_expanded.append(pd.json_normalize(df_dicts[col]).add_prefix(col+'_'))

        # Replace the raw JSON columns with their expanded, prefixed columns.
        df = pd.concat([df.drop(self._json_cols, axis=1), pd.concat(df_expanded, axis=1)], axis=1)
        return df

    def data_cleaning(self, df):
        """
        Filter data frame by relevant columns and rows.

        Keeps only the base feature columns and the rows whose state is
        'successful' or 'failed', and encodes the state as 1 / 0. The input
        frame is left unmodified.

        Parameters
        ---------
        df: Pandas DataFrame

        Returns
        --------
        df: Pandas DataFrame
        """
        self.base_features = ['country', 'currency', 'category_name', 'location_type', 'goal',
                              'launched_at', 'created_at', 'blurb', 'state', 'deadline', 'static_usd_rate']
        df = df[self.base_features]

        #df.dropna(inplace=True)
        # Copy so the assignment below writes to an independent frame rather
        # than to a selection of the caller's data.
        df = df.query("state == 'successful' or state == 'failed'").copy()
        dic = {'successful' : 1, 'failed' : 0}
        df['state'] = df['state'].map(dic)
        return df

    def feature_engineering(self, df):
        """
        Add custom features

        The frame passed in is modified in place and also returned.

        Parameters
        ---------
        df: Pandas DataFrame

        Returns
        --------
        df: Pandas DataFrame
        """
        # Campaign duration in days (timestamps are in seconds).
        df['duration'] = (df.deadline-df.launched_at)/(3600*24)
        df['duration'] = df['duration'].round(2)
        df.drop(['deadline'], axis=1, inplace=True)

        df['goal_usd'] = df['goal'] * df['static_usd_rate']
        df['goal_usd'] = df['goal_usd'].round(2)
        df.drop(['static_usd_rate', 'goal'], axis=1, inplace=True)

        df['launched_at_full'] = pd.to_datetime(df['launched_at'], unit='s')
        df['launched_at_month'] = pd.DatetimeIndex(df['launched_at_full']).month
        df.drop(['launched_at', 'launched_at_full'], axis=1, inplace=True)

        df['created_at_full'] = pd.to_datetime(df['created_at'], unit='s')
        df['created_at_month'] = pd.DatetimeIndex(df['created_at_full']).month
        df.drop(['created_at', 'created_at_full'], axis=1, inplace=True)

        # Number of space-separated tokens in the blurb; 0 for non-strings.
        df['blurb_len'] = [len(x.split(" ")) if isinstance(x, str) else 0 for x in df.blurb]
        df.drop(['blurb'], axis=1, inplace=True)

        return df

    def read_csv(self, name):
        """
        Read csv file in kickstarter format

        Prompts for a file name; an empty answer selects the default file.

        Parameters
        ---------
        name: String. Only for display purposes.

        Returns
        --------
        df: Pandas DataFrame
        """
        file_name = input(f"Please enter {name} csv file name: ")
        if(not file_name):
            file_name = './data/Kickstarter003.csv'
            print(f'Taking default file {file_name}')
        return pd.read_csv(file_name)

    def processor_lossy(self, df):
        """
        Apply data frame preprocessing. Outside of sklearn.pipeline

        Parameters
        ---------
        df: Pandas DataFrame

        Returns
        --------
        X: Pandas DataFrame of predictors
        y: Pandas Series with the encoded state
        """
        df = self.expand_json_cols(df)
        df = self.data_cleaning(df)
        X = df.drop('state', axis=1)
        y = df.state
        return X, y

    def dump_model(self):
        """Write ``self.model`` to a time-stamped file under ./output."""
        t = datetime.now().strftime("%Y-%m-%d_%H%M%S")
        o = f"./output/model_dump_{t}.pickle"
        print(f'Dumping model to pickle: {o}')
        dump(self.model, o)

    def model_fit_and_export(self):
        """
        Wrapper for fit and export tasks

        Parameters
        ---------
        None

        Returns
        --------
        None
        """
        df = self.read_csv('train')
        self.X_train, self.y_train = self.processor_lossy(df)
        self.X_train = self.feature_engineering(self.X_train)
        self.X_train = self.preprocessor.fit_transform(self.X_train)
        self.model.fit(self.X_train, self.y_train)
        self.dump_model()

    def model_load(self):
        """
        Load model from pickle file and store in class attribute.

        An empty answer keeps the model currently held by the instance.

        Parameters
        ---------
        None

        Returns
        --------
        None
        """
        model_file_name = input('Please enter model file name: ')
        if(model_file_name):
            self.model=load(model_file_name)
        else:
            print('Taking previously trained model.')

    def printscore(self):
        """Print the classification report for the stored test predictions."""
        print(classification_report(self.y_test, self.y_pred))

    def prediction_tocsv(self):
        """Write ``self.y_pred`` to a time-stamped CSV file under ./output."""
        t = datetime.now().strftime("%Y-%m-%d_%H%M%S")
        o = f"./output/y_pred_{t}.csv"
        print(f'Writing prediction to csv: {o}')
        pd.DataFrame(self.y_pred).to_csv(o, index = False)

    def readcsv_and_predict(self):
        """
        Wrapper for read and predict tasks

        Parameters
        ---------
        None

        Returns
        --------
        None
        """
        df = self.read_csv('test')
        self.X_test, self.y_test = self.processor_lossy(df)
        self.X_test = self.feature_engineering(self.X_test)
        self.X_test = self.preprocessor.transform(self.X_test)
        self.y_pred = self.model.predict(self.X_test)
        self.printscore()
        self.prediction_tocsv()
# Load the training data and the submission data.
df = pd.read_csv('train.csv')
df_sub = pd.read_csv('test.csv')

# Keep the submission ids aside and remove them from the predictors.
case_id = df_sub['id']
df_sub = df_sub.drop(columns=['id'])

# Columns 1..10 of the training file are predictors, column 11 is the target.
X = df.iloc[:, 1:11].values
y = df.iloc[:, 11].values
X_sub = df_sub.iloc[:, :].values

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode positional columns 0, 5 and 6; pass the others through.
ohe = OneHotEncoder()
ctX = ColumnTransformer(
    transformers=[('X', ohe, [0, 5, 6])],
    remainder='passthrough',
)
X = ctX.fit_transform(X)
X_sub = ctX.transform(X_sub)

from sklearn.preprocessing import StandardScaler

# Standardise three columns of the encoded matrix; the scaler is fitted on
# the training data and reused for the submission data.
sc = StandardScaler()
X[:, [7, 11, 13]] = sc.fit_transform(X[:, [7, 11, 13]])
X_sub[:, [7, 11, 13]] = sc.transform(X_sub[:, [7, 11, 13]])

# Class weights inversely proportional to the class frequencies.
neg, pos = np.bincount(y)
total = neg + pos
w0 = (1 / neg) * total / 2
w1 = (1 / pos) * total / 2
#weights = {0: w0, 1: w1}
weights = [w0, w1]

from imblearn.over_sampling import RandomOverSampler, SMOTE
# Numeric columns: median imputation followed by standardisation.
numeric_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])

# Categorical columns are the ones with pandas 'category' dtype.
categorical_features = features_train.select_dtypes(
    include=['category']).columns
# Missing categories are replaced by the constant 'missing' before one-hot
# encoding.
categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')
            ), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# numeric_features is defined outside this section.
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features
                   ), ('cat', categorical_transformer, categorical_features)])

# Fit on the training features only; the test features are just transformed.
features_train = preprocessor.fit_transform(features_train)
features_test = preprocessor.transform(features_test)
print(features_train.shape)

# Train the model.
# The backend is chosen by args.library; each backend is imported lazily.
if args.library == 'pytorch':
    from pytorch_linear import LinearRegressor
    model = LinearRegressor(batch_size=1000, n_epochs=1, learning_rate=0.01)
    model.fit(features_train, labels_train)
elif args.library == 'sklearn':
    from sklearn.linear_model import SGDRegressor
    # Only constructed here; this section does not show the fit call.
    model = SGDRegressor(
        max_iter=1,
        eta0=0.01,
        learning_rate='constant',
        tol=None,
    )
def predict(fn_code, inputVal, predictionVal):
    """Dispatch to a regression helper selected by ``fn_code``.

    ``fn_code == 1`` uses ``fib_exp_regression``, ``fn_code == 3`` uses
    ``merge_sort_poly_regression`` and any other value uses
    ``linear_regression``; each helper receives ``predictionVal``.
    """
    if fn_code == 1:
        return fib_exp_regression(predictionVal)
    elif fn_code == 3:
        return merge_sort_poly_regression(predictionVal)
    else:
        return linear_regression(predictionVal)

    # NOTE(review): the original indentation is not recoverable from this
    # view. Laid out as part of this function (the only placement where the
    # trailing ``return`` and the use of ``inputVal`` are valid), everything
    # below is unreachable because every branch above returns. Confirm whether
    # this was meant to be the body of a separate helper.
    actual = []
    df_train = pd.read_csv("./final.csv", header=0, index_col=False)
    actual = df_train["tottime"].tolist()
    df_train, df_test = train_test_split(df_train, test_size=0.1, random_state=100)

    target = ["tottime"]
    numeric_features = ["ncalls"]
    all_features = numeric_features

    # Median imputation of the numeric feature, fitted on the training split.
    imputers = [("numeric", SimpleImputer(strategy="median"), numeric_features)
                ]
    impute_transformer = ColumnTransformer(transformers=imputers)
    impute_transformer.fit(df_train)
    df_train_imp = pd.DataFrame(impute_transformer.transform(df_train), index=df_train.index, columns=all_features)
    df_test_imp = pd.DataFrame(impute_transformer.transform(df_test), index=df_test.index, columns=all_features)

    # Standardise the imputed feature, again fitted on the training split.
    feature_transformers = [('scale', StandardScaler(), numeric_features)]
    feature_preprocessor = ColumnTransformer(transformers=feature_transformers)
    feature_preprocessor.fit(df_train_imp)
    X_train_imp_encode = feature_preprocessor.transform(df_train_imp)
    X_test_imp_encode = feature_preprocessor.transform(df_test_imp)
    df_train_imp_encode = pd.DataFrame(X_train_imp_encode, index=df_train_imp.index, columns=numeric_features)
    df_test_imp_encode = pd.DataFrame(X_test_imp_encode, index=df_test_imp.index, columns=numeric_features)

    y_train = df_train["tottime"]
    y_test = df_test["tottime"]
    # print(X_train_imp_encode.shape)
    # print("===========")
    # print(y_train.shape)
    lr = LinearRegression()
    lr.fit(X_train_imp_encode, y_train)

    predictions = []
    # now predict can be used with lr
    # NOTE(review): the model is fitted on scaled features but queried with
    # the raw value ``x`` -- confirm this is intended.
    for x in range(inputVal + 1, predictionVal + 1):
        predictions.extend(lr.predict([[x]]))
    return {"predictions": predictions, "actual": actual}
data["target"], shuffle=False, test_size=0.20) cat = [t != "int64" for t in x_train.dtypes] num = [t == "int64" for t in x_train.dtypes] cat_names = x_train.columns[cat] num_names = x_train.columns[num] transformer = ColumnTransformer( [("num", StandardScaler(), num), ("cat", OneHotEncoder(handle_unknown="ignore"), cat)], ) x_train = transformer.fit_transform(x_train) x_test = transformer.transform(x_test) cat_names = transformer.transformers_[1][1].get_feature_names(cat_names) all_feature_names = list(num_names) all_feature_names.extend(cat_names) model = XGBClassifier(max_depth=5, n_estimators=100, min_child_weight=3, colsample_bytree=0.68, subsample=0.63) model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)],
def main(file):
    """Train several classifiers on a tweet-sentiment CSV and print reports.

    Reads ``file``, drops bookkeeping columns, label-encodes
    ``airline_sentiment`` into ``label``, fills missing values, makes an
    80/20 train/test split and prints a classification report for gradient
    boosting, logistic regression and two pasting ensembles.

    Parameters
    ----------
    file : path or buffer accepted by ``pd.read_csv``.

    Returns
    -------
    None
    """
    from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import classification_report
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder, StandardScaler
    from sklearn.tree import DecisionTreeClassifier

    facts = pd.read_csv(file, encoding='unicode_escape')

    # drop useless features
    facts = facts.drop(columns=[
        '_unit_id', '_golden', '_last_judgment_at', 'tweet_created',
        'tweet_id', 'tweet_location', 'name', 'airline_sentiment_gold',
        'tweet_coord',
    ])

    # label encoding
    labelencoder = LabelEncoder()
    facts['label'] = labelencoder.fit_transform(facts['airline_sentiment'])

    # check label
    print('2 is', labelencoder.inverse_transform([2]))
    print('1 is', labelencoder.inverse_transform([1]))
    print('0 is ', labelencoder.inverse_transform([0]))

    # drop label
    facts = facts.drop(columns=["airline_sentiment"])

    # fill all null values
    # numerical features get their column median
    for column in ('negativereason:confidence', 'airline_sentiment:confidence'):
        facts[column] = facts[column].fillna(facts[column].median())

    # missing time zones get a fixed, hard-coded value
    facts['user_timezone'] = facts['user_timezone'].fillna(
        'Eastern Time (US & Canada)')

    # no negation reasons, then fill nothing
    facts['negativereason'] = facts['negativereason'].fillna('none')
    facts['negativereason_gold'] = facts['negativereason_gold'].fillna('none')

    # split
    train, test = train_test_split(facts, test_size=0.2, random_state=42)

    simplefilter(action='ignore', category=FutureWarning)

    # bag of words
    cvec = CountVectorizer(
        lowercase=False,
        ngram_range=(1, 2),
        # vocabulary=whitelist,  # You can work with your own whitelist
        max_features=5000,  # cap on the vocabulary size
        token_pattern=u"(?u)\\b\\S+\\b",  # Use these settings if you want to keep punctuation
        analyzer="word")
    cvec.fit(train['text'])

    # preprocessing
    num_fea = [
        '_trusted_judgments', 'airline_sentiment:confidence',
        'negativereason:confidence', 'retweet_count'
    ]
    num_fea_transformer = Pipeline(steps=[('scaler', StandardScaler())])

    category_fea = [
        '_unit_state', 'airline', 'user_timezone', 'negativereason'
    ]
    categorical_transformer = Pipeline(
        steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))  # For SVM and similar
               # ('ordinal', OrdinalEncoder())  # For Trees/Gradient Boosting
               ])

    text_fea = ['text']
    text_transformer = ColumnTransformer(transformers=[
        ('count', cvec, "text"),
    ])

    # Only the columns listed above reach the models; 'label' is not among
    # them.
    preprocessor = ColumnTransformer(transformers=[
        ('sca', num_fea_transformer, num_fea),
        ('text', text_transformer, text_fea),
        ('cat', categorical_transformer, category_fea)])

    # encoded train and test data
    encoded = preprocessor.fit_transform(train)
    X_test = preprocessor.transform(test)

    # evaluate models
    # XGBOOST
    # boost_model = xgb.XGBClassifier().fit(encoded, train['label'])
    # X_test = preprocessor.transform(test)
    # preds = boost_model.predict(X_test)
    # print('classification report for xgboost:\n', classification_report(test["label"], preds))

    # boosting
    gb_clf = GradientBoostingClassifier(n_estimators=100, random_state=42)
    gb_clf.fit(encoded, train['label'])
    predict_gb = gb_clf.predict(X_test)
    print('classification report for gradient boost:\n',
          classification_report(test["label"], predict_gb))

    # Logistic regression
    lg = LogisticRegression()
    lg.fit(encoded, train['label'])
    predict_lg = lg.predict(X_test)
    print('classification report for logistic regression :\n',
          classification_report(test["label"], predict_lg))

    # pasting: bagging without replacement, each member trained on 60% of
    # the rows; run once per base estimator.
    pasting_bases = [
        ('Logistic regression',
         LogisticRegression(multi_class='auto', solver='lbfgs', max_iter=10000)),
        ('Decision Tree Classifier', DecisionTreeClassifier()),
    ]
    for base_name, base_estimator in pasting_bases:
        bag_clf = BaggingClassifier(base_estimator,
                                    n_estimators=100,
                                    max_samples=int(np.ceil(0.6 * encoded.shape[0])),
                                    bootstrap=False,
                                    n_jobs=3,
                                    random_state=42)
        bag_clf.fit(encoded, train['label'])
        pred_bag = bag_clf.predict(X_test)
        print(f'classification report for pasting({base_name}):\n',
              classification_report(test["label"], pred_bag))
gs.cv_results_['params']): results[np.sqrt(-ms)] = params # #### c. 테스트 # In[82]: final = gs.best_estimator_ # In[83]: strat_test_set.head() # In[88]: x_test = full_pipeline.transform( strat_test_set.drop('median_house_value', axis=1)) y_test = strat_test_set['median_house_value'].copy() # In[98]: a = final.predict(x_test) # In[100]: final # In[99]: from sklearn.metrics import mean_squared_error np.sqrt(mean_squared_error(a, y_test))
'''Scaling and transforming data'''
print("--- Scaling and transforming data ---\n")

# The two location id columns are categorical; every other column of the
# training frame is treated as numerical.
numerical_X_train = X_train.drop(columns=['PULocationID', 'DOLocationID'])
num_attribs_X_train = list(numerical_X_train.columns)
cat_attribs_X_train = ['PULocationID', 'DOLocationID']

numerical_X_test = X_test.drop(columns=['PULocationID', 'DOLocationID'])
cat_attribs_X_test = ['PULocationID', 'DOLocationID']

# Standardise the numerical columns and one-hot encode the location ids.
scale_transform = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), num_attribs_X_train),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs_X_train),
    ]
)

# Fitted on the training data only; the test data is just transformed.
X_train_prepared = scale_transform.fit_transform(X_train)
X_test_prepared = scale_transform.transform(X_test)

'''Linear regression'''
# print("--- Calculating Linear Regression ---\n")
# lin_reg = LinearRegression()
# print("\n\tCross validation with RMSE for Linear Regression:")
# lin_reg_scores_nMSE = cross_val_score(lin_reg, X_train_prepared, y_train, scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
# lin_reg_rmse_scores = np.sqrt(-lin_reg_scores_nMSE)
# print("\nScores RMSE:\n", lin_reg_rmse_scores)
# print("\nMean score RMSE:\n", lin_reg_rmse_scores.mean())
# print("\nStandard deviation:\n", lin_reg_rmse_scores.std())
#
# print("\n\tCross validation with MAE for Linear Regression:")
# lin_reg_scores_MAE = cross_val_score(lin_reg, X_train_prepared, y_train, scoring="neg_mean_absolute_error", cv=10, n_jobs=-1)
# print("\nScores MAE:\n", lin_reg_scores_MAE)
# print("\nMean score MAE:\n", lin_reg_scores_MAE.mean())
# Bare expressions: they only display a value in an interactive session.
housing_prepared
housing_prepared.shape

# Linear Regression model
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

# let's try the full preprocessing pipeline on a few training instances
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", lin_reg.predict(some_data_prepared))

# Compare predictions with actual data
print("Labels:", list(some_labels))

# Measure model's RMSE on whole training set
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
# After this line lin_mse holds the root of the MSE (the RMSE), despite its name.
lin_mse = np.sqrt(lin_mse)
lin_mse
def examples():
    """Print a tour of scikit-learn's pipeline and composition helpers.

    Covers, in order: building and inspecting a ``Pipeline``, grid search
    over pipeline parameters and over whole steps, ``make_pipeline``, step
    caching with ``memory``, ``TransformedTargetRegressor`` and
    ``ColumnTransformer``. Everything is reported through ``print``; nothing
    is returned.
    """
    from shutil import rmtree
    from tempfile import mkdtemp

    import pandas as pd
    from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
    from sklearn.decomposition import PCA
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.linear_model import LinearRegression, LogisticRegression
    from sklearn.model_selection import GridSearchCV, train_test_split
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.pipeline import Pipeline, make_pipeline
    from sklearn.preprocessing import Binarizer, QuantileTransformer
    from sklearn.svm import SVC

    try:
        from sklearn.datasets import load_boston as load_regression_data
    except ImportError:
        # load_boston no longer exists in scikit-learn >= 1.2; fall back to
        # another bundled regression dataset so the demo still runs.
        from sklearn.datasets import load_diabetes as load_regression_data

    estimators = [('reduce_dim', PCA()), ('clf', SVC())]
    pipe = Pipeline(estimators)
    print(pipe)
    print(pipe.steps[0])
    print(pipe.named_steps['reduce_dim'])
    pipe.set_params(clf__C=10)
    print(pipe.named_steps['clf'])

    ###################################################
    # Grid search over parameters of the pipeline steps (important)
    param_grid = dict(reduce_dim__n_components=[2, 5, 10],
                      clf__C=[0.1, 10, 100])
    grid_search = GridSearchCV(pipe, param_grid=param_grid)
    print(grid_search)

    ###################################################
    # Grid search that also swaps whole pipeline steps (important)
    param_grid = dict(reduce_dim=[None, PCA(5), PCA(10)],
                      clf=[SVC(), LogisticRegression()],
                      clf__C=[0.1, 10, 100])  # several candidates go in a list
    grid_search = GridSearchCV(pipe, param_grid=param_grid)
    print(grid_search)

    ###################################################
    pipe = make_pipeline(Binarizer(), MultinomialNB())
    print(pipe)

    ###################################################
    # Use ``memory`` to avoid repeating computations
    estimators = [('reduce_dim', PCA()), ('clf', SVC())]
    cachedir = mkdtemp()
    try:
        pipe = Pipeline(estimators, memory=cachedir)
        print(pipe)
    finally:
        # Clear the cache directory when you don't need it anymore
        rmtree(cachedir)

    #####################################################
    # Transforming target in regression
    dataset = load_regression_data()
    X = dataset.data
    y = dataset.target
    transformer = QuantileTransformer(output_distribution='normal')
    regressor = LinearRegression()
    regr = TransformedTargetRegressor(regressor=regressor,
                                      transformer=transformer)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    regr.fit(X_train, y_train)
    print('R2 score: {0:.2f}'.format(regr.score(X_test, y_test)))
    raw_target_regr = LinearRegression().fit(X_train, y_train)
    print('R2 score: {0:.2f}'.format(raw_target_regr.score(X_test, y_test)))

    ##########################################################
    # Per-column preprocessing
    X = pd.DataFrame({
        'city': ['London', 'London', 'Paris', 'Sallisaw'],
        'title': [
            "His Last Bow", "How Watson Learned the Trick", "A Moveable Feast",
            "The Grapes of Wrath"
        ],
        'expert_rating': [5, 3, 4, 5],
        'user_rating': [4, 5, 4, 3]
    })

    column_trans = ColumnTransformer(
        [('city_category', CountVectorizer(analyzer=lambda x: [x]), 'city'),
         ('title_bow', CountVectorizer(), 'title')],
        remainder='drop')

    print(column_trans.fit(X))
    # get_feature_names was removed in scikit-learn 1.2; keep using it where
    # it exists and fall back to get_feature_names_out otherwise.
    feature_names = getattr(column_trans, 'get_feature_names', None)
    if feature_names is None:
        feature_names = column_trans.get_feature_names_out
    print(feature_names())
    print(column_trans.transform(X).toarray())
) ], remainder="passthrough", ) # %% train_x = ct.fit_transform(train_x) clf = DecisionTreeClassifier() clf.fit(train_x, train_y) # %% evaluate(clf, train_x, train_y) # %% test_x = test[selected_columns] test_x["Cabin"] = test_x["Cabin"].fillna("NA") test_x["Embarked"] = test_x["Embarked"].fillna("NA") test_x["Age"] = test_x["Age"].fillna(ave_age) test_x["Fare"] = test_x["Fare"].fillna(train["Fare"].mean()) # %% test_x = ct.transform(test_x) pred = clf.predict(test_x) # %% test_y = truth["Survived"] evaluate(clf, test_x, test_y) # %%
###############################################################################
# We will perform a 10-fold cross-validation and train the neural-network with
# the two different strategies previously presented.

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10)

cv_results_imbalanced = []
cv_time_imbalanced = []
cv_results_balanced = []
cv_time_balanced = []

for train_idx, valid_idx in skf.split(X_train, y_train):
    # The preprocessor is re-fitted on the training part of every fold and
    # only applied to the validation part.
    X_local_train = preprocessor.fit_transform(X_train.iloc[train_idx])
    y_local_train = y_train.iloc[train_idx].values.ravel()
    X_local_test = preprocessor.transform(X_train.iloc[valid_idx])
    y_local_test = y_train.iloc[valid_idx].values.ravel()

    # Run both strategies on the same fold, imbalanced first, and record the
    # elapsed time and ROC AUC of each in its own pair of lists.
    for _fit_predict, _timings, _scores in (
        (fit_predict_imbalanced_model, cv_time_imbalanced,
         cv_results_imbalanced),
        (fit_predict_balanced_model, cv_time_balanced, cv_results_balanced),
    ):
        elapsed_time, roc_auc = _fit_predict(
            X_local_train, y_local_train, X_local_test, y_local_test)
        _timings.append(elapsed_time)
        _scores.append(roc_auc)

###############################################################################
# Plot of the results and computation time

###############################################################################