def _knn_regression(train_table, test_table, feature_cols, label_col, k=5,
                    algorithm='auto', leaf_size=30, p=2,
                    pred_col_name='prediction'):
    """Fit a k-nearest-neighbours regressor and predict for ``test_table``.

    Args:
        train_table: DataFrame holding the training features and label.
        test_table: DataFrame to score.
        feature_cols: Feature column names handed to ``check_col_type``.
        label_col: Name of the target column in ``train_table``.
        k: Number of neighbours.
        algorithm, leaf_size, p: Forwarded to ``KNeighborsRegressor``.
        pred_col_name: Name of the appended prediction column.

    Returns:
        dict: ``{'out_table': DataFrame}`` -- the test rows (index reset)
        followed by the prediction column. An empty ``test_table`` yields an
        empty frame with the same output columns and no model is trained.
    """
    # Nothing to score: short-circuit before touching the training data.
    if test_table.shape[0] == 0:
        empty_schema = test_table.columns.tolist() + [pred_col_name]
        return {'out_table': pd.DataFrame(columns=empty_schema)}

    _, train_features = check_col_type(train_table, feature_cols)
    train_target = train_table[label_col]
    _, test_features = check_col_type(test_table, feature_cols)

    regressor = KNeighborsRegressor(n_neighbors=k, algorithm=algorithm,
                                    leaf_size=leaf_size, p=p)
    regressor.fit(train_features, train_target)

    predicted = pd.DataFrame({pred_col_name: regressor.predict(test_features)})

    # The index is reset so the positional prediction frame lines up row by row.
    scored = pd.concat([test_table.reset_index(drop=True), predicted], axis=1)
    return {'out_table': scored}
def _knn_classification(train_table, test_table, feature_cols, label_col,
                        k=5, algorithm='auto', leaf_size=30, p=2,
                        pred_col_name='prediction',
                        prob_col_prefix='probability', suffix='index'):
    """Fit a k-nearest-neighbours classifier and score ``test_table``.

    Args:
        train_table: DataFrame with training features and label.
        test_table: DataFrame to score.
        feature_cols: Feature column names handed to ``check_col_type``.
        label_col: Name of the class-label column in ``train_table``.
        k, algorithm, leaf_size, p: Forwarded to ``KNeighborsClassifier``.
        pred_col_name: Name of the predicted-label column.
        prob_col_prefix: Prefix of the per-class probability columns.
        suffix: ``'index'`` numbers the probability columns 0..n-1; any other
            value uses the class labels themselves as suffixes.

    Returns:
        dict: ``{'out_table': DataFrame}`` with the test rows (index reset),
        the prediction column and one probability column per class. For an
        empty ``test_table`` only the column layout is returned.
    """
    _, train_features = check_col_type(train_table, feature_cols)
    train_labels = train_table[label_col]
    _, test_features = check_col_type(test_table, feature_cols)

    # A continuous target cannot be classified; error code '0718' is raised.
    if sklearn_utils.multiclass.type_of_target(train_labels) == 'continuous':
        raise_error('0718', 'label_col')

    classifier = KNeighborsClassifier(n_neighbors=k, algorithm=algorithm,
                                      leaf_size=leaf_size, p=p)
    classifier.fit(train_features, train_labels)
    class_labels = classifier.classes_

    # The model is fitted even for an empty test table because the class
    # list determines the probability column names.
    if test_table.shape[0] == 0:
        if suffix == 'index':
            empty_prob_cols = [prob_col_prefix + '_{}'.format(pos)
                               for pos in range(len(class_labels))]
        else:
            empty_prob_cols = [prob_col_prefix + '_{}'.format(cls)
                               for cls in class_labels]
        schema = test_table.columns.tolist() + [pred_col_name] + empty_prob_cols
        return {'out_table': pd.DataFrame(columns=schema)}

    predicted = pd.DataFrame(classifier.predict(test_features),
                             columns=[pred_col_name])

    if suffix == 'index':
        col_suffixes = [pos for pos, _ in enumerate(class_labels)]
    else:
        col_suffixes = class_labels

    probabilities = classifier.predict_proba(test_features)
    prob_names = [
        '{prob_col_prefix}_{suffix}'.format(prob_col_prefix=prob_col_prefix,
                                            suffix=tag)
        for tag in col_suffixes
    ]
    prob_frame = pd.DataFrame(data=probabilities, columns=prob_names)

    scored = pd.concat(
        [test_table.reset_index(drop=True), predicted, prob_frame], axis=1)
    return {'out_table': scored}
def _ada_boost_regression_predict(table, model, pred_col_name='prediction'):
    """Append AdaBoost regression predictions to a copy of ``table``.

    Args:
        table: DataFrame to score.
        model: Model dict; reads ``model['regressor']`` and
            ``model['params']['feature_cols']``.
        pred_col_name: Name of the appended prediction column.

    Returns:
        dict: ``{'out_table': DataFrame}``. An empty ``table`` yields an empty
        frame carrying the input columns plus ``pred_col_name``.
    """
    # Guard against zero-row input instead of handing an empty matrix to the
    # estimator; the output keeps the expected column layout.
    if table.shape[0] == 0:
        new_cols = table.columns.tolist() + [pred_col_name]
        return {'out_table': pd.DataFrame(columns=new_cols)}

    out_table = table.copy()
    regressor = model['regressor']
    _, test_data = check_col_type(table, model['params']['feature_cols'])
    out_table[pred_col_name] = regressor.predict(test_data)
    return {'out_table': out_table}
def _logistic_regression_predict(table, model, prediction_col='prediction',
                                 prob_prefix='probability',
                                 output_log_prob=False,
                                 log_prob_prefix='log_probability',
                                 thresholds=None, suffix='index'):
    """Score ``table`` with a fitted logistic-regression model.

    Args:
        table: DataFrame to score.
        model: Model dict; reads ``model['features']`` and
            ``model['lr_model']`` (must expose ``classes_``,
            ``predict_proba`` and ``predict_log_proba``).
        prediction_col: Name of the predicted-class column.
        prob_prefix: Prefix of the probability columns.
        output_log_prob: When true, log-probability columns are appended too.
        log_prob_prefix: Prefix of the log-probability columns.
        thresholds: Per-class thresholds. ``None`` means equal thresholds; a
            one-element list ``[t]`` with ``0 < t < 1`` in the binary case is
            expanded to ``[t, 1 - t]``.
        suffix: ``'index'`` numbers columns 0..n-1, otherwise class labels
            are used as suffixes.

    Returns:
        dict: ``{'out_table': DataFrame}`` with the input rows (original
        index preserved), the prediction and the probability columns.
    """
    feature_cols = model['features']
    _, features = check_col_type(table, feature_cols)
    lr_model = model['lr_model']
    classes = lr_model.classes_
    len_classes = len(classes)
    is_binary = len_classes == 2

    if thresholds is None:
        thresholds = np.array([1 / len_classes for _ in classes])
    elif isinstance(thresholds, list):
        if len(thresholds) == 1 and is_binary and 0 < thresholds[0] < 1:
            thresholds = np.array([thresholds[0], 1 - thresholds[0]])
        else:
            thresholds = np.array(thresholds)

    len_thresholds = len(thresholds)
    if len_classes > 0 and len_thresholds > 0 and len_classes != len_thresholds:
        # FN-0613='%s' must have length equal to the number of classes.
        raise_error('0613', ['thresholds'])

    prob = lr_model.predict_proba(features)
    # Pick the class with the largest threshold-scaled probability. Doing
    # this on the array (not a RangeIndex Series) keeps the assignment below
    # positional, so tables with a non-default index are not filled with NaN.
    prediction = np.asarray(classes)[np.argmax(prob / thresholds, axis=1)]

    out_table = table.copy()
    out_table[prediction_col] = prediction

    if suffix == 'index':
        suffixes = [i for i, _ in enumerate(classes)]
    else:
        suffixes = classes

    prob_cols = [
        '{probability_col}_{suffix}'.format(probability_col=prob_prefix,
                                            suffix=suffix)
        for suffix in suffixes
    ]
    # Share the table's index so concat(axis=1) joins row by row.
    frames = [out_table,
              pd.DataFrame(data=prob, columns=prob_cols, index=out_table.index)]

    if output_log_prob:
        log_prob = lr_model.predict_log_proba(features)
        logprob_cols = [
            '{log_probability_col}_{suffix}'.format(
                log_probability_col=log_prob_prefix, suffix=suffix)
            for suffix in suffixes
        ]
        frames.append(pd.DataFrame(data=log_prob, columns=logprob_cols,
                                   index=out_table.index))

    out_table = pd.concat(frames, axis=1)
    return {'out_table': out_table}
def _ada_boost_classification_predict(table, model, pred_col_name='prediction',
                                      prob_col_prefix='probability',
                                      suffix='index'):
    """Append AdaBoost class predictions and probabilities to ``table``.

    Args:
        table: DataFrame to score.
        model: Model dict; reads ``model['classifier']`` and, for non-empty
            input, ``model['params']['feature_cols']``.
        pred_col_name: Name of the predicted-label column.
        prob_col_prefix: Prefix of the per-class probability columns.
        suffix: ``'index'`` numbers the probability columns 0..n-1; any other
            value uses the class labels as suffixes.

    Returns:
        dict: ``{'out_table': DataFrame}``. For an empty ``table`` an empty
        frame with the full output column layout is returned.
    """
    classifier = model['classifier']
    classes = classifier.classes_

    suffixes = range(len(classes)) if suffix == 'index' else classes
    prob_col_name = [
        '{prob_col_prefix}_{suffix}'.format(prob_col_prefix=prob_col_prefix,
                                            suffix=tag)
        for tag in suffixes
    ]

    if table.shape[0] == 0:
        new_cols = table.columns.tolist() + [pred_col_name] + prob_col_name
        return {'out_table': pd.DataFrame(columns=new_cols)}

    _, test_data = check_col_type(table, model['params']['feature_cols'])

    out_table = table.copy()
    out_table[pred_col_name] = classifier.predict(test_data)

    # Reuse the table's index: concat(axis=1) aligns on index labels, so a
    # fresh RangeIndex would misalign filtered or re-indexed tables.
    out_col_prob = pd.DataFrame(data=classifier.predict_proba(test_data),
                                columns=prob_col_name, index=out_table.index)
    out_table = pd.concat([out_table, out_col_prob], axis=1)
    return {'out_table': out_table}
def _mean_shift_samples_plot(table, input_cols, n_samples, cluster_centers,
                             colors):
    """Draw sampled rows and cluster centres as lines over the features.

    Args:
        table: Source DataFrame.
        input_cols: Columns to plot.
        n_samples: Number of rows to sample; ``None`` plots every row.
        cluster_centers: Iterable of centre coordinates, one per cluster.
        colors: Colour per cluster, indexed like ``cluster_centers``.

    Returns:
        Whatever ``plt2MD(plt)`` returns for the finished figure.
    """
    if n_samples is not None:
        sample = table[input_cols].sample(n=n_samples)
    else:
        sample = table[input_cols]
    feature_names, sample = check_col_type(sample, input_cols)

    # Long tick labels are rotated so they do not overlap.
    total_label_chars = np.sum([len(name) for name in feature_names])
    x = range(len(feature_names))
    if total_label_chars >= 512:
        tick_kwargs = {'rotation': 'vertical'}
    elif total_label_chars >= 64:
        tick_kwargs = {'rotation': 45, 'ha': 'right'}
    else:
        tick_kwargs = {}
    plt.xticks(x, feature_names, **tick_kwargs)

    if feature_names == input_cols:
        # check_col_type kept the columns as they were: plot row by row via
        # the index labels.
        transposed = sample.transpose()
        for row_label in sample.index:
            plt.plot(x, transposed[row_label], color='grey', linewidth=1)
    else:
        # Feature names changed, so rows are addressed by position instead.
        for row_pos in range(len(sample)):
            plt.plot(x, sample[row_pos], color='grey', linewidth=1)

    for cluster_idx, center in enumerate(cluster_centers):
        plt.plot(x, center, "o-", linewidth=4, color=colors[cluster_idx])

    plt.tight_layout()
    fig_samples = plt2MD(plt)
    plt.clf()
    return fig_samples
def _ada_boost_classification_predict(table, model, pred_col_name='prediction',
                                      prob_col_prefix='probability',
                                      suffix='index'):
    """Append AdaBoost class predictions and probabilities to ``table``.

    Args:
        table: DataFrame to score.
        model: Model dict; reads ``model['classifier']`` and, for non-empty
            input, ``model['params']['feature_cols']``.
        pred_col_name: Name of the predicted-label column.
        prob_col_prefix: Prefix of the per-class probability columns.
        suffix: ``'index'`` numbers the probability columns 0..n-1; any other
            value uses the class labels as suffixes.

    Returns:
        dict: ``{'out_table': DataFrame}``. For an empty ``table`` an empty
        frame with the full output column layout is returned.
    """
    classifier = model['classifier']
    classes = classifier.classes_

    if suffix == 'index':
        suffixes = [i for i, _ in enumerate(classes)]
    else:
        suffixes = classes
    prob_col_name = [
        '{prob_col_prefix}_{suffix}'.format(prob_col_prefix=prob_col_prefix,
                                            suffix=suffix)
        for suffix in suffixes
    ]

    # Zero-row input: return the output schema without calling the
    # estimator on an empty matrix.
    if table.shape[0] == 0:
        new_cols = table.columns.tolist() + [pred_col_name] + prob_col_name
        return {'out_table': pd.DataFrame(columns=new_cols)}

    out_table = table.copy()
    _, test_data = check_col_type(table, model['params']['feature_cols'])
    out_table[pred_col_name] = classifier.predict(test_data)

    prob = classifier.predict_proba(test_data)
    # Give the probability frame the table's index so concat(axis=1) joins
    # row by row even when the index is not 0..n-1.
    out_col_prob = pd.DataFrame(data=prob, columns=prob_col_name,
                                index=out_table.index)
    out_table = pd.concat([out_table, out_col_prob], axis=1)
    return {'out_table': out_table}
def _ada_boost_regression_train(table, feature_cols, label_col, max_depth=3,
                                n_estimators=50, learning_rate=1.0,
                                loss='linear', random_state=None):
    """Train an AdaBoost regressor on decision-tree base learners.

    Args:
        table: Training DataFrame.
        feature_cols: Feature column names handed to ``check_col_type``.
        label_col: Name of the target column.
        max_depth: Depth of each ``DecisionTreeRegressor`` base learner.
        n_estimators, learning_rate, loss, random_state: Forwarded to
            ``AdaBoostRegressor``.

    Returns:
        dict: ``{'model': model}`` where ``model`` holds the fitted
        ``regressor``, its ``parameters`` (``get_params()``), a ``params``
        dict of the training settings, the markdown report in
        ``_repr_brtc_`` and a ``feature_importance_table`` DataFrame.
    """
    feature_names, x_train = check_col_type(table, feature_cols)
    y_train = table[label_col]

    base_estimator = DecisionTreeRegressor(max_depth=max_depth)
    # Hyper-parameters go by keyword: positional binding depends on the
    # constructor's argument order and is rejected by recent scikit-learn.
    regressor = AdaBoostRegressor(base_estimator,
                                  n_estimators=n_estimators,
                                  learning_rate=learning_rate,
                                  loss=loss,
                                  random_state=random_state)
    regressor.fit(x_train, y_train)

    params = {
        'feature_cols': feature_cols,
        'label_col': label_col,
        'feature_importance': regressor.feature_importances_,
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'loss': loss,
        'random_state': random_state
    }

    model = _model_dict('ada_boost_regression_model')
    get_param = regressor.get_params()
    model['parameters'] = get_param
    model['regressor'] = regressor
    model['params'] = params

    fig_feature_importance = _plot_feature_importance(feature_names, regressor)
    # Separate name for the markdown rendering so the dict stored in
    # model['params'] is not shadowed.
    params_md = dict2MD(get_param)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
        | ## AdaBoost Regression Train Result
        |
        | ### Feature Importance
        | {fig_feature_importance}
        |
        | ### Parameters
        | {list_parameters}
        |
        """.format(fig_feature_importance=fig_feature_importance,
                   list_parameters=params_md)))
    model['_repr_brtc_'] = rb.get()

    feature_importance = regressor.feature_importances_
    feature_importance_table = pd.DataFrame(
        [[name, feature_importance[i]] for i, name in enumerate(feature_names)],
        columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table
    return {'model': model}
def _logistic_regression_train(table, feature_cols, label_col, penalty='l2',
                               dual=False, tol=0.0001, C=1.0,
                               fit_intercept=True, intercept_scaling=1,
                               class_weight=None, random_state=None,
                               solver='liblinear', max_iter=100,
                               multi_class='ovr', verbose=0, warm_start=False,
                               n_jobs=1):
    """Train a logistic-regression classifier and build a coefficient report.

    Args:
        table: Training DataFrame.
        feature_cols: Feature column names handed to ``check_col_type``.
        label_col: Name of the class-label column.
        penalty ... n_jobs: Forwarded unchanged to ``LogisticRegression``.

    Returns:
        dict: ``{'model': model}`` with the fitted estimator (``lr_model``),
        its intercept/coefficients/classes, the chosen penalty and solver and
        a markdown summary in ``_repr_brtc_``.

    Raises:
        Whatever ``raise_error('0718', 'label_col')`` raises when the label
        column is detected as continuous.
    """
    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]

    if sklearn_utils.multiclass.type_of_target(label) == 'continuous':
        raise_error('0718', 'label_col')

    # Keyword arguments keep the call independent of the constructor's
    # positional order, which recent scikit-learn no longer accepts.
    lr_model = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C,
                                  fit_intercept=fit_intercept,
                                  intercept_scaling=intercept_scaling,
                                  class_weight=class_weight,
                                  random_state=random_state, solver=solver,
                                  max_iter=max_iter, multi_class=multi_class,
                                  verbose=verbose, warm_start=warm_start,
                                  n_jobs=n_jobs)
    lr_model.fit(features, label)

    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2

    # Rows of the summary: optional intercept first, then one per feature.
    if fit_intercept == True:
        row_labels = ['intercept'] + feature_names
        coef_trans = np.concatenate(([intercept], np.transpose(coefficients)),
                                    axis=0)
    else:
        row_labels = feature_names
        coef_trans = np.transpose(coefficients)

    # A binary model has a single coefficient vector, reported under the
    # first class label; otherwise there is one column per class.
    coef_columns = [classes[0]] if is_binary else classes
    summary = pd.concat((pd.DataFrame({'features': row_labels}),
                         pd.DataFrame(coef_trans, columns=coef_columns)),
                        axis=1)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Logistic Regression Result
    | ### Summary
    | {table1}
    """.format(table1=pandasDF2MD(summary))))

    model = _model_dict('logistic_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
def _pls_regression_predict(table, model, prediction_col='prediction'):
    """Append PLS regression predictions, one column per predicted target.

    Args:
        table: DataFrame to score.
        model: Model dict; reads ``model['feature_cols']`` and
            ``model['pls_model']``.
        prediction_col: Prefix of the output columns; column ``i`` of the
            prediction is stored as ``<prediction_col>_<i>``.

    Returns:
        dict: ``{'out_table': DataFrame}`` -- a copy of ``table`` with the
        prediction columns added.
    """
    scored = table.copy()
    _, features = check_col_type(scored, model['feature_cols'])
    predicted = model['pls_model'].predict(features)

    n_targets = predicted.shape[-1]
    for target_idx in range(n_targets):
        column_name = prediction_col + "_{}".format(target_idx)
        scored[column_name] = predicted[:, target_idx]

    return {'out_table': scored}
def _mean_shift_predict(table, model, prediction_col='prediction'):
    """Assign each row of ``table`` to a cluster of a fitted mean-shift model.

    Args:
        table: DataFrame to score.
        model: Model dict; reads ``model['model']`` (the estimator) and
            ``model['input_cols']``.
        prediction_col: Name of the appended cluster-label column.

    Returns:
        dict: ``{'out_table': DataFrame}`` -- a copy of ``table`` with the
        cluster labels added.
    """
    estimator = model['model']
    _, feature_values = check_col_type(table, model['input_cols'])
    cluster_labels = estimator.predict(feature_values)

    scored = table.copy()
    scored[prediction_col] = cluster_labels
    return {'out_table': scored}
def _random_forest_regression_train(table, feature_cols, label_col,
                                    n_estimators=10, criterion="mse",
                                    max_depth=None, min_samples_split=2,
                                    min_samples_leaf=1,
                                    min_weight_fraction_leaf=0,
                                    max_features="None", max_leaf_nodes=None,
                                    min_impurity_decrease=0,
                                    random_state=None):
    """Train a random-forest regressor and report feature importances.

    Args:
        table: Training DataFrame.
        feature_cols: Feature column names handed to ``check_col_type``.
        label_col: Name of the target column.
        max_features: The string ``"None"`` is translated to ``None`` before
            it reaches the estimator.
        Remaining keyword arguments are forwarded to
        ``RandomForestRegressor`` unchanged.

    Returns:
        dict: ``{'model': model}`` with the fitted ``regressor``, the
        ``params`` used, the markdown report (``_repr_brtc_``) and a
        ``feature_importance_table`` DataFrame.
    """
    feature_names, X_train = check_col_type(table, feature_cols)
    y_train = table[label_col]

    # "None" arrives as text; the estimator expects the real None.
    if max_features == "None":
        max_features = None

    forest_kwargs = {
        'n_estimators': n_estimators,
        'criterion': criterion,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'min_weight_fraction_leaf': min_weight_fraction_leaf,
        'max_features': max_features,
        'max_leaf_nodes': max_leaf_nodes,
        'min_impurity_decrease': min_impurity_decrease,
        'random_state': random_state,
    }
    regressor = RandomForestRegressor(**forest_kwargs)
    regressor.fit(X_train, y_train)

    # Stored parameters: the column selection first, then the estimator
    # settings in the same order as above.
    params = {'feature_cols': feature_cols, 'label_col': label_col}
    params.update(forest_kwargs)

    model = _model_dict('random_forest_regression_model')
    model['regressor'] = regressor
    model['params'] = params

    fig_feature_importances = _plot_feature_importances(feature_names,
                                                        regressor)

    report = """
    | ## Random Forest Regression Train Result
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    """.format(fig_feature_importances=fig_feature_importances)
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin(report))
    model['_repr_brtc_'] = rb.get()

    importances = regressor.feature_importances_
    importance_rows = [[name, importances[pos]]
                       for pos, name in enumerate(feature_names)]
    model['feature_importance_table'] = pd.DataFrame(
        importance_rows, columns=['feature_name', 'importance'])

    return {'model': model}
def _decision_tree_regression_predict(table, model,
                                      prediction_col='prediction',
                                      check_input=True):
    """Append decision-tree regression predictions to a copy of ``table``.

    Args:
        table: DataFrame to score.
        model: Model dict; reads ``model['feature_cols']`` and
            ``model['regressor']``.
        prediction_col: Name of the appended prediction column.
        check_input: Passed as the second positional argument of
            ``regressor.predict``.

    Returns:
        dict: ``{'out_table': DataFrame}``.
    """
    scored = table.copy()
    _, features = check_col_type(table, model['feature_cols'])
    tree = model['regressor']
    scored[prediction_col] = tree.predict(features, check_input)
    return {'out_table': scored}
def _decision_tree_classification_predict(table, model,
                                          prediction_col='prediction',
                                          check_input=True):
    """Append decision-tree class predictions to a copy of ``table``.

    Args:
        table: DataFrame to score.
        model: Model dict; reads ``model['feature_cols']`` and
            ``model['classifier']``.
        prediction_col: Name of the appended prediction column.
        check_input: Passed as the second positional argument of
            ``classifier.predict``.

    Returns:
        dict: ``{'out_table': DataFrame}``.
    """
    scored = table.copy()
    _, features = check_col_type(table, model['feature_cols'])
    tree = model['classifier']
    predicted = tree.predict(features, check_input)
    scored[prediction_col] = predicted
    return {'out_table': scored}
def _pls_regression_train(table, feature_cols, label_cols, n_components=2,
                          scale=True, max_iter=500, tol=1e-6):
    """Fit a PLS regression model and report in-sample error metrics.

    Args:
        table: Training DataFrame.
        feature_cols: Feature column names handed to ``check_col_type``.
        label_cols: Target column names handed to ``check_col_type``.
        n_components, scale, max_iter, tol: Forwarded to ``PLS``.

    Returns:
        dict: ``{'model': model}`` with the fitted ``pls_model``, the metrics
        computed on the training data (MAE, MSE, R2) and a markdown report in
        ``_repr_brtc_``.
    """
    pls_model = PLS(n_components=n_components, scale=scale,
                    max_iter=max_iter, tol=tol)
    _, features = check_col_type(table, feature_cols)
    _, labels = check_col_type(table, label_cols)

    pls_model.fit(features, labels)
    predict = pls_model.predict(features)

    # Metrics are evaluated on the training data itself.
    _mean_absolute_error = mean_absolute_error(labels, predict)
    _mean_squared_error = mean_squared_error(labels, predict)
    _r2_score = r2_score(labels, predict)

    # DataFrame.from_items is gone from pandas >= 1.0; the plain constructor
    # with an explicit column order builds the same two-column tables.
    result_table = pd.DataFrame(
        {
            'Metric': ['Mean Absolute Error', 'Mean Squared Error', 'R2 Score'],
            'Score': [_mean_absolute_error, _mean_squared_error, _r2_score],
        },
        columns=['Metric', 'Score'])

    label_name = {
        'n_components': 'Number of components',
        'scale': "Scale",
        'max_iter': 'Max iteration',
        'tol': 'Tolerance'
    }
    get_param = pls_model.get_params()
    param_table = pd.DataFrame(
        {
            'Parameter': list(label_name.values()),
            'Value': [get_param[key] for key in label_name],
        },
        columns=['Parameter', 'Value'])

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ### PLS Regression Result
    | {result}
    | ### Parameters
    | {list_parameters}
    """.format(result=pandasDF2MD(result_table),
               list_parameters=pandasDF2MD(param_table))))

    model = _model_dict('pls_regression_model')
    model['feature_cols'] = feature_cols
    model['label'] = label_cols
    model['mean_absolute_error'] = _mean_absolute_error
    model['mean_squared_error'] = _mean_squared_error
    model['r2_score'] = _r2_score
    model['max_iter'] = max_iter
    model['tol'] = tol
    model['pls_model'] = pls_model
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _ada_boost_regression_predict(table, model, pred_col_name='prediction'):
    """Append AdaBoost regression predictions to a copy of ``table``.

    Args:
        table: DataFrame to score.
        model: Model dict; reads ``model['regressor']`` and
            ``model['params']['feature_cols']``.
        pred_col_name: Name of the appended prediction column.

    Returns:
        dict: ``{'out_table': DataFrame}``. An empty ``table`` produces an
        empty frame with the input columns plus ``pred_col_name``; the model
        is not consulted in that case.
    """
    row_count = table.shape[0]
    if row_count == 0:
        schema = table.columns.tolist() + [pred_col_name]
        return {'out_table': pd.DataFrame(columns=schema)}

    scored = table.copy()
    estimator = model['regressor']
    feature_cols = model['params']['feature_cols']
    _, feature_values = check_col_type(table, feature_cols)
    scored[pred_col_name] = estimator.predict(feature_values)
    return {'out_table': scored}
def _kmeans_predict(table, model, prediction_col='prediction'):
    """Assign cluster labels with a fitted k-means model.

    Two model layouts are accepted, both with ``_context == 'python'``:
    ``_type == 'kmeans'`` keeps the estimator under ``model['model']`` and
    ``_type == 'kmeans_silhouette'`` keeps it under ``model['best_model']``.
    Anything else is reported through ``raise_runtime_error``.

    Args:
        table: DataFrame to score.
        model: Model dict as described above; ``model['input_cols']`` names
            the feature columns.
        prediction_col: Name of the appended cluster-label column.

    Returns:
        dict: ``{'out_table': DataFrame}`` -- a copy of ``table`` with the
        cluster labels added.
    """
    if model['_context'] == 'python' and model['_type'] in ('kmeans', 'kmeans_silhouette'):
        # Only the key holding the estimator differs between the two layouts.
        estimator_key = 'model' if model['_type'] == 'kmeans' else 'best_model'
        estimator = model[estimator_key]
        _, features = check_col_type(table, model['input_cols'])
        cluster_labels = estimator.predict(features)
        out_table = table.copy()
        out_table[prediction_col] = cluster_labels
    else:
        raise_runtime_error("Unsupported model")

    return {'out_table': out_table}
def _penalized_linear_regression_predict(table, model,
                                         prediction_col='prediction'):
    """Append penalized linear-regression predictions to a copy of ``table``.

    Args:
        table: DataFrame to score.
        model: Model dict; reads ``model['feature_cols']`` and
            ``model['regression_model']``.
        prediction_col: Name of the appended prediction column.

    Returns:
        dict: ``{'out_table': DataFrame}``.
    """
    scored = table.copy()
    _, features = check_col_type(scored, model['feature_cols'])
    estimator = model['regression_model']
    scored[prediction_col] = estimator.predict(features)
    return {'out_table': scored}
def _gaussian_mixture_predict(table, model, display_probability,
                              prediction_col_name='prediction'):
    """Append Gaussian-mixture component predictions to a copy of ``table``.

    Args:
        table: DataFrame to score.
        model: Model dict; reads ``model['gmm']``, ``model['input_cols']``
            and, when probabilities are requested,
            ``model['number_of_components']``.
        display_probability: When ``True``, one ``probability_<i>`` column
            per component is appended.
        prediction_col_name: Name of the predicted-component column.

    Returns:
        dict: ``{'out_table': DataFrame}``.
    """
    out_table = table.copy()
    _, inputarr = check_col_type(table, model['input_cols'])
    gmm = model['gmm']
    out_table[prediction_col_name] = gmm.predict(inputarr)

    if display_probability == True:
        # Evaluate the posterior once, on the same checked features that
        # were used for predict(), rather than once per component.
        proba = gmm.predict_proba(inputarr)
        for i in range(0, model['number_of_components']):
            # Assigning the raw column is positional, so tables with a
            # non-default index are filled correctly.
            out_table['probability_' + str(i)] = proba[:, i]

    return {'out_table': out_table}
def _mlp_regression_predict(table, model, prediction_col='prediction'):
    """Append MLP regression predictions to a copy of ``table``.

    Args:
        table: DataFrame to score.
        model: Model dict; reads ``model['features']`` and
            ``model['mlp_model']``.
        prediction_col: Name of the appended prediction column.

    Returns:
        dict: ``{'out_table': DataFrame}``.
    """
    scored = table.copy()
    _, features = check_col_type(scored, model['features'])
    network = model['mlp_model']
    scored[prediction_col] = network.predict(features)
    return {'out_table': scored}
def _svm_classification_train(table, feature_cols, label_col, c=1.0,
                              kernel='rbf', degree=3, gamma='auto', coef0=0.0,
                              shrinking=True, probability=True, tol=1e-3,
                              max_iter=-1, random_state=None):
    """Train a support-vector classifier and build its parameter report.

    Args:
        table: Training DataFrame.
        feature_cols: Feature column names handed to ``check_col_type``.
        label_col: Name of the class-label column.
        c: Passed to ``svm.SVC`` as ``C``.
        kernel ... random_state: Forwarded to ``svm.SVC`` unchanged.

    Returns:
        dict: ``{'model': model}`` with the fitted estimator under
        ``svc_model``, the requested ``features`` and a markdown report in
        ``_repr_brtc_``.
    """
    working_copy = table.copy()
    feature_names, features = check_col_type(table, feature_cols)
    target = working_copy[label_col]

    if sklearn_utils.multiclass.type_of_target(target) == 'continuous':
        raise_runtime_error('''Label Column should not be continuous.''')

    classifier = svm.SVC(C=c, kernel=kernel, degree=degree, gamma=gamma,
                         coef0=coef0, shrinking=shrinking,
                         probability=probability, tol=tol, max_iter=max_iter,
                         random_state=random_state)
    fitted = classifier.fit(features, target)

    # The report lists the estimator parameters plus the column selection.
    report_params = classifier.get_params()
    report_params['feature_cols'] = feature_names
    report_params['label_col'] = label_col

    report = """
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter}
    """.format(table_parameter=dict2MD(report_params))
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin(report))

    _model = _model_dict('svc_model')
    _model['svc_model'] = fitted
    _model['features'] = feature_cols
    _model['_repr_brtc_'] = rb.get()
    return {'model': _model}
def _kmeans_train_predict(table, input_cols, n_clusters=3,
                          prediction_col='prediction', init='k-means++',
                          n_init=10, max_iter=300, tol=1e-4,
                          precompute_distances='auto', seed=None, n_jobs=1,
                          algorithm='auto', n_samples=None):
    """Fit k-means on ``table``, label its rows and build a visual report.

    Args:
        table: DataFrame to cluster.
        input_cols: Feature column names handed to ``check_col_type``.
        n_clusters ... algorithm: Forwarded to ``SKKMeans``; ``seed`` is
            passed as ``random_state``.
        prediction_col: Name of the appended cluster-label column.
        n_samples: Rows drawn for the sample plot; ``None`` uses every row.

    Returns:
        dict: ``{'out_table': DataFrame, 'model': model}`` -- the labelled
        copy of ``table`` and a model dict holding the fitted estimator,
        ``input_cols`` and the markdown report.
    """
    feature_names, inputarr = check_col_type(table, input_cols)

    if n_samples is None:
        n_samples = len(inputarr)

    k_means = SKKMeans(n_clusters=n_clusters, init=init, n_init=n_init,
                       max_iter=max_iter, tol=tol,
                       precompute_distances=precompute_distances, verbose=0,
                       random_state=seed, copy_x=True, n_jobs=n_jobs,
                       algorithm=algorithm)
    k_means.fit(inputarr)

    # Captured with the requested n_clusters, before it is re-derived below.
    params = {'input_cols': feature_names,
              'n_clusters': n_clusters,
              'init': init,
              'n_init': n_init,
              'max_iter': max_iter,
              'tol': tol,
              'precompute_distances': precompute_distances,
              'seed': seed,
              'n_jobs': n_jobs,
              'algorithm': algorithm,
              'n_samples': n_samples}

    cluster_centers = k_means.cluster_centers_
    center_count = len(cluster_centers)
    # One evenly spaced colour per cluster from the nipy_spectral colormap.
    colors = cm.nipy_spectral(np.arange(center_count).astype(float) / center_count)
    labels = k_means.labels_

    # Project onto at most two principal components for the scatter plot.
    pca2_model = PCA(n_components=min(2, len(feature_names))).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    fig_centers = _kmeans_centers_plot(feature_names, cluster_centers, colors)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples,
                                       cluster_centers, seed, colors)
    fig_pca = _kmeans_pca_plot(labels, cluster_centers, pca2_model, pca2,
                               colors)

    report = """
    | ## Kmeans Result
    | - Number of iterations run: {n_iter_}.
    | - Sum of square error: {sse_}.
    | - Coordinates of cluster centers
    | {fig_cluster_centers}
    | - Samples
    | {fig_pca}
    | {fig_samples}
    |
    | ### Parameters
    | {params}
    """.format(n_iter_=k_means.n_iter_, sse_=k_means.inertia_,
               fig_cluster_centers=fig_centers, fig_pca=fig_pca,
               fig_samples=fig_samples, params=dict2MD(params))
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin(report))

    model = _model_dict('kmeans')
    model['model'] = k_means
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = labels
    return {'out_table': out_table, 'model': model}
def _xgb_regression_predict(table, model, prediction_col='prediction',
                            output_margin=False, ntree_limit=None):
    """Append XGBoost regression predictions to a copy of ``table``.

    Args:
        table: DataFrame to score.
        model: Model dict; reads ``model['feature_cols']`` and
            ``model['regressor']``.
        prediction_col: Name of the appended prediction column.
        output_margin, ntree_limit: Passed positionally to
            ``regressor.predict`` after the features.

    Returns:
        dict: ``{'out_table': DataFrame}``.
    """
    _, features = check_col_type(table, model['feature_cols'])
    booster = model['regressor']
    predicted = booster.predict(features, output_margin, ntree_limit)

    scored = table.copy()
    scored[prediction_col] = predicted
    return {'out_table': scored}
def _glm_predict(table, model, prediction_col='prediction'):
    """Append GLM predictions to a copy of ``table``.

    Args:
        table: DataFrame to score.
        model: Model dict; reads ``model['features']``,
            ``model['fit_intercept']`` and ``model['glm_model']``.
        prediction_col: Name of the appended prediction column.

    Returns:
        dict: ``{'out_table': DataFrame}``.
    """
    feature_cols = model['features']
    _, features = check_col_type(table, feature_cols)
    fit_intercept = model['fit_intercept']
    glm_model = model['glm_model']

    if fit_intercept == True:
        # has_constant='add' forces the intercept column even when a feature
        # is already constant (always the case for single-row input); the
        # default 'skip' would drop it and break the design-matrix shape.
        prediction = glm_model.predict(
            sm.add_constant(features, has_constant='add'))
    else:
        prediction = glm_model.predict(features)

    result = table.copy()
    result[prediction_col] = prediction
    return {'out_table': result}
def _xgb_classification_predict(table, model, prediction_col='prediction',
                                probability_col='probability',
                                thresholds=None, suffix='index',
                                output_margin=False, ntree_limit=None):
    """Score ``table`` with a fitted XGBoost classifier.

    Args:
        table: DataFrame to score.
        model: Model dict; reads ``model['feature_cols']`` and
            ``model['classifier']``.
        prediction_col: Name of the predicted-class column.
        probability_col: Prefix of the per-class probability columns.
        thresholds: Per-class thresholds. ``None`` means equal thresholds; a
            one-element list ``[t]`` with ``0 < t < 1`` in the binary case is
            expanded to ``[t, 1 - t]``.
        suffix: ``'index'`` numbers the probability columns 0..n-1, otherwise
            the class labels are used.
        output_margin: Accepted for interface compatibility; not used here.
        ntree_limit: Passed as the second positional argument of
            ``classifier.predict_proba``.

    Returns:
        dict: ``{'out_table': DataFrame}`` with the input rows (original
        index preserved), the prediction and the probability columns.
    """
    feature_cols = model['feature_cols']
    classifier = model['classifier']

    _, features = check_col_type(table, feature_cols)

    classes = classifier.classes_
    len_classes = len(classes)
    is_binary = len_classes == 2

    if thresholds is None:
        thresholds = np.array([1 / len_classes for _ in classes])
    elif isinstance(thresholds, list):
        if len(thresholds) == 1 and is_binary and 0 < thresholds[0] < 1:
            thresholds = np.array([thresholds[0], 1 - thresholds[0]])
        else:
            thresholds = np.array(thresholds)

    prob = classifier.predict_proba(features, ntree_limit)
    # The class with the largest threshold-scaled probability wins.
    prediction = classes[np.argmax(prob / thresholds, axis=1)]

    if suffix == 'index':
        suffixes = [i for i, _ in enumerate(classes)]
    else:
        suffixes = classes

    prob_cols = [
        '{probability_col}_{suffix}'.format(probability_col=probability_col,
                                            suffix=suffix)
        for suffix in suffixes
    ]

    result = table.copy()
    result[prediction_col] = prediction
    # Share the table's index; concat(axis=1) aligns on labels, so a fresh
    # RangeIndex would misalign tables whose index is not 0..n-1.
    prob_df = pd.DataFrame(data=prob, columns=prob_cols, index=result.index)
    result = pd.concat([result, prob_df], axis=1)
    return {'out_table': result}
def _naive_bayes_predict(table, model, suffix, display_log_prob=False,
                         prediction_col='prediction',
                         prob_prefix='probability',
                         log_prob_prefix='log_probability'):
    """Score ``table`` with a naive Bayes model.

    The model dict may come in two layouts: one holding a fitted estimator
    under ``'nb_model'``, or one holding a parameter table under
    ``'table_1'`` from which a two-class estimator is rebuilt.

    Args:
        table: DataFrame to score; it is not modified.
        model: Model dict. Feature columns are read from ``'features'`` or,
            if absent, ``'feature_cols'``. An optional ``'label_encoder'``
            maps predictions back to the original labels.
        suffix: ``'label'`` uses the encoder's class labels as column
            suffixes, anything else uses 0..n-1. Without a label encoder the
            suffixes are always ``[0, 1]``.
        display_log_prob: When ``True``, log-probability columns are appended.
        prediction_col: Name of the predicted-label column.
        prob_prefix: Prefix of the probability columns.
        log_prob_prefix: Prefix of the log-probability columns.

    Returns:
        dict: ``{'out_table': DataFrame}``.
    """
    if 'features' in model:
        feature_cols = model['features']
    else:
        feature_cols = model['feature_cols']
    _, features = check_col_type(table, feature_cols)

    if 'nb_model' in model:
        nb_model = model['nb_model']
    else:
        # Rebuild an estimator from the stored parameter table: fit on a
        # dummy sample to initialise it, then overwrite the learned state.
        model_table = model['table_1']
        if model_table.model_type[0] == 'multinomial':
            nb_model = MultinomialNB()
        else:
            nb_model = BernoulliNB()
        nb_model.fit([[1]], [1])
        nb_model.classes_ = np.array([0, 1])
        nb_model.class_log_prior_ = model_table.pi.values
        nb_model.feature_log_prob_ = np.array(list(model_table.theta))

    prediction = nb_model.predict(features)
    if 'label_encoder' in model:
        label_encoder = model['label_encoder']
        prediction = label_encoder.inverse_transform(prediction)
        if suffix == 'label':
            suffixes = label_encoder.classes_
        else:
            suffixes = range(0, len(label_encoder.classes_))
    else:
        suffixes = [0, 1]

    prob = nb_model.predict_proba(features)
    prob_cols = ['{prefix}_{suffix}'.format(prefix=prob_prefix, suffix=suffix)
                 for suffix in suffixes]

    # Work on a copy so the caller's table does not gain a prediction column.
    result = table.copy()
    result[prediction_col] = prediction

    # Every appended frame shares the table's index so concat(axis=1) joins
    # row by row regardless of the index labels.
    frames = [result,
              pd.DataFrame(data=prob, columns=prob_cols, index=result.index)]

    if display_log_prob == True:
        log_prob = nb_model.predict_log_proba(features)
        logprob_cols = ['{prefix}_{suffix}'.format(prefix=log_prob_prefix,
                                                   suffix=suffix)
                        for suffix in suffixes]
        frames.append(pd.DataFrame(data=log_prob, columns=logprob_cols,
                                   index=result.index))

    result = pd.concat(frames, axis=1)
    return {'out_table': result}
def _linear_regression_predict(table, model, prediction_col='prediction'):
    """Append linear-regression predictions to a copy of ``table``.

    Args:
        table: DataFrame to score.
        model: Model dict; reads ``model['features']``,
            ``model['fit_intercept']`` and ``model['lr_model']``.
        prediction_col: Name of the appended prediction column.

    Returns:
        dict: ``{'out_table': DataFrame}``.
    """
    scored = table.copy()
    _, features = check_col_type(scored, model['features'])
    with_intercept = model['fit_intercept']
    fitted = model['lr_model']

    # A constant column is always prepended when the model was fitted with
    # an intercept (has_constant='add' never skips it).
    if with_intercept == True:
        design = sm.add_constant(features, has_constant='add')
    else:
        design = features

    scored[prediction_col] = fitted.predict(design)
    return {'out_table': scored}
def _kmeans_samples_plot(table, input_cols, n_samples, cluster_centers, seed,
                         colors):
    """Draw sampled rows and k-means centres as lines over the features.

    Args:
        table: Source DataFrame.
        input_cols: Feature column names handed to ``check_col_type``.
        n_samples: Number of rows to sample for the grey background lines.
        cluster_centers: Iterable of centre coordinates, one per cluster.
        seed: ``random_state`` used for the row sample.
        colors: Colour per cluster, indexed like ``cluster_centers``.

    Returns:
        Whatever ``plt2MD(plt)`` returns for the finished figure.
    """
    feature_names, inputarr = check_col_type(table, input_cols)
    total_label_chars = np.sum([len(name) for name in feature_names])
    sampled = pd.DataFrame(inputarr).sample(n=n_samples, random_state=seed)

    x = range(len(feature_names))
    # Long tick labels are rotated so they do not overlap.
    if total_label_chars >= 512:
        tick_kwargs = {'rotation': 'vertical'}
    elif total_label_chars >= 64:
        tick_kwargs = {'rotation': 45, 'ha': 'right'}
    else:
        tick_kwargs = {}
    plt.xticks(x, feature_names, **tick_kwargs)

    # After transposing, each sampled row is a column keyed by its label.
    by_row = sampled.transpose()
    for row_label in sampled.index:
        plt.plot(x, by_row[row_label], color='grey', linewidth=1)

    for cluster_idx, center in enumerate(cluster_centers):
        plt.plot(x, center, "o-", label=cluster_idx, linewidth=2,
                 color=colors[cluster_idx])

    plt.tight_layout()
    fig_samples = plt2MD(plt)
    plt.clf()
    return fig_samples
def _naive_bayes_predict(table, model, suffix, display_log_prob=False,
                         prediction_col='prediction',
                         prob_prefix='probability',
                         log_prob_prefix='log_probability'):
    """Score ``table`` with a fitted naive Bayes model.

    Args:
        table: DataFrame to score; it is not modified.
        model: Model dict; reads ``model['features']``, ``model['nb_model']``
            and ``model['label_encoder']``.
        suffix: ``'label'`` uses the encoder's class labels as column
            suffixes, anything else uses 0..n-1.
        display_log_prob: When ``True``, log-probability columns are appended.
        prediction_col: Name of the predicted-label column.
        prob_prefix: Prefix of the probability columns.
        log_prob_prefix: Prefix of the log-probability columns.

    Returns:
        dict: ``{'out_table': DataFrame}``.
    """
    feature_cols = model['features']
    _, features = check_col_type(table, feature_cols)
    nb_model = model['nb_model']
    label_encoder = model['label_encoder']

    # The estimator predicts encoded labels; map them back to the originals.
    prediction_correspond = nb_model.predict(features)
    prediction = label_encoder.inverse_transform(prediction_correspond)

    if suffix == 'label':
        suffixes = label_encoder.classes_
    else:
        suffixes = range(0, len(label_encoder.classes_))

    prob = nb_model.predict_proba(features)
    prob_cols = [
        '{prefix}_{suffix}'.format(prefix=prob_prefix, suffix=suffix)
        for suffix in suffixes
    ]

    # Work on a copy so the caller's table does not gain a prediction column.
    result = table.copy()
    result[prediction_col] = prediction

    # Every appended frame shares the table's index so concat(axis=1) joins
    # row by row regardless of the index labels.
    frames = [result,
              pd.DataFrame(data=prob, columns=prob_cols, index=result.index)]

    if display_log_prob == True:
        log_prob = nb_model.predict_log_proba(features)
        logprob_cols = [
            '{prefix}_{suffix}'.format(prefix=log_prob_prefix, suffix=suffix)
            for suffix in suffixes
        ]
        frames.append(pd.DataFrame(data=log_prob, columns=logprob_cols,
                                   index=result.index))

    result = pd.concat(frames, axis=1)
    return {'out_table': result}
def _logistic_regression_train(table, feature_cols, label_col, penalty='l2',
                               dual=False, tol=0.0001, C=1.0,
                               fit_intercept=True, intercept_scaling=1,
                               class_weight=None, random_state=None,
                               solver='liblinear', max_iter=100,
                               multi_class='ovr', verbose=0, warm_start=False,
                               n_jobs=1):
    """Train a logistic-regression classifier with inference statistics.

    Besides fitting the model, the function computes AIC/BIC from the
    training log-likelihood and standard errors of the coefficients (plus
    Wald statistics and p-values in the binary case).

    Args:
        table: Training DataFrame.
        feature_cols: Feature column names handed to ``check_col_type``.
        label_col: Name of the class-label column.
        penalty ... n_jobs: Forwarded unchanged to ``LogisticRegression``.

    Returns:
        dict: ``{'model': model}`` with the fitted estimator (``lr_model``),
        coefficients, ``standard_errors``, ``aic``, ``bic``, the ``summary``
        DataFrame, a markdown report (``_repr_brtc_``) and, for binary
        targets, ``wald_statistics`` and ``p_values``.

    Raises:
        Whatever ``raise_error('0718', 'label_col')`` raises when the label
        column is detected as continuous.
    """
    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]

    if sklearn_utils.multiclass.type_of_target(label) == 'continuous':
        raise_error('0718', 'label_col')

    # Keyword arguments keep the call independent of the constructor's
    # positional order, which recent scikit-learn no longer accepts.
    lr_model = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C,
                                  fit_intercept=fit_intercept,
                                  intercept_scaling=intercept_scaling,
                                  class_weight=class_weight,
                                  random_state=random_state, solver=solver,
                                  max_iter=max_iter, multi_class=multi_class,
                                  verbose=verbose, warm_start=warm_start,
                                  n_jobs=n_jobs)
    lr_model.fit(features, label)

    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2

    prob = lr_model.predict_proba(features)

    # Log-likelihood of the training labels. Summing log-probabilities
    # avoids the underflow to 0 (and hence infinite AIC/BIC) that a running
    # product of probabilities hits on larger tables.
    class_position = {cls: pos for pos, cls in enumerate(classes)}
    label_pos = np.array([class_position[value] for value in label], dtype=int)
    log_likelihood = np.sum(np.log(prob[np.arange(len(label_pos)), label_pos]))

    k = len(feature_cols) + 1 if fit_intercept else len(feature_cols)
    aic = 2 * k - 2 * log_likelihood
    bic = np.log(len(table)) * k - 2 * log_likelihood

    # Design matrix; np.asarray accepts both DataFrame and ndarray features.
    feature_matrix = np.asarray(features)
    if fit_intercept:
        x_design = np.hstack([np.ones((feature_matrix.shape[0], 1)),
                              feature_matrix])
    else:
        x_design = feature_matrix

    def _standard_errors(weights):
        # sqrt(diag((X' W X)^-1)) with W = diag(weights).
        weighted = x_design * weights[:, np.newaxis]
        cov_logit = np.linalg.inv(np.dot(weighted.T, x_design))
        return np.sqrt(np.diag(cov_logit))

    if is_binary:
        # p * (1 - p) for the two-class case.
        std_err = _standard_errors(np.prod(prob, axis=1))
        if fit_intercept:
            logit_params = np.insert(coefficients, 0, intercept)
        else:
            # Flatten (1, n_features) so the statistics stay one-dimensional
            # like in the intercept case.
            logit_params = np.ravel(coefficients)
        wald = (logit_params / std_err) ** 2
        p_values = 1 - chi2.cdf(wald, 1)
    else:
        std_err = np.array([
            _standard_errors(prob[:, pos] * (1 - prob[:, pos]))
            for pos in range(len(classes))
        ])

    if fit_intercept:
        summary = pd.DataFrame({'features': ['intercept'] + feature_names})
        coef_trans = np.concatenate(([intercept], np.transpose(coefficients)),
                                    axis=0)
    else:
        summary = pd.DataFrame({'features': feature_names})
        coef_trans = np.transpose(coefficients)

    if is_binary:
        # Single coefficient vector, reported under the first class label.
        summary = pd.concat(
            (summary,
             pd.DataFrame(coef_trans, columns=[classes[0]]),
             pd.DataFrame(std_err, columns=['standard_error']),
             pd.DataFrame(wald, columns=['wald_statistic']),
             pd.DataFrame(p_values, columns=['p_value'])),
            axis=1)
    else:
        std_err_cols = ['standard_error_{}'.format(cls) for cls in classes]
        summary = pd.concat(
            (summary,
             pd.DataFrame(coef_trans, columns=classes),
             pd.DataFrame(std_err.T, columns=std_err_cols)),
            axis=1)
        # Interleave each class's coefficients with its standard errors.
        arrange_col = ['features']
        for cls in classes:
            arrange_col.append(cls)
            arrange_col.append('standard_error_{}'.format(cls))
        summary = summary[arrange_col]

    rb = BrtcReprBuilder()
    if is_binary:
        rb.addMD(
            strip_margin("""
            | ## Logistic Regression Result
            | ### Summary
            | {table1}
            |
            | ##### Column '{small}' is the coefficients under the assumption ({small} = 0, {big} = 1).
            |
            | #### AIC : {aic}
            |
            | #### BIC : {bic}
            """.format(small=classes[0], big=classes[1],
                       table1=pandasDF2MD(summary, num_rows=100),
                       aic=aic, bic=bic)))
    else:
        rb.addMD(
            strip_margin("""
            | ## Logistic Regression Result
            | ### Summary
            | {table1}
            |
            | ##### Each column whose name is one of classes of Label Column is the coefficients under the assumption it is 1 and others are 0.
            |
            | ##### For example, column '{small}' is the coefficients under the assumption ({small} = 1, others = 0).
            |
            | #### AIC : {aic}
            |
            | #### BIC : {bic}
            """.format(small=classes[0],
                       table1=pandasDF2MD(summary, num_rows=100),
                       aic=aic, bic=bic)))

    model = _model_dict('logistic_regression_model')
    model['standard_errors'] = std_err
    model['aic'] = aic
    model['bic'] = bic
    if is_binary:
        model['wald_statistics'] = wald
        model['p_values'] = p_values
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()
    model['summary'] = summary
    return {'model': model}