def _svc_train(table, feature_cols, label_col, c=1.0, kernel='rbf', degree=3,
               gamma='auto', coef0=0.0, shrinking=True, probability=True,
               tol=1e-3, max_iter=-1, random_state=None):
    _table = table.copy()

    _feature_cols = _table[feature_cols]
    _label_col = _table[label_col]

    _svc = svm.SVC(C=c, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0,
                   shrinking=shrinking, probability=probability, tol=tol,
                   max_iter=max_iter, random_state=random_state)
    _svc_model = _svc.fit(_feature_cols, _label_col)

    get_param = _svc.get_params()
    get_param['feature_cols'] = feature_cols
    get_param['label_col'] = label_col

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter}
    """.format(table_parameter=dict2MD(get_param))))

    _model = _model_dict('svc_model')
    _model['svc_model'] = _svc_model
    _model['features'] = feature_cols
    _model['report'] = rb.get()

    return {'model': _model}
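
# Usage sketch (illustrative only; `_example_svc_train` is a hypothetical helper,
# not part of the original API). It assumes pandas is imported as `pd` and that
# the module-level imports used above (sklearn's `svm`, the report helpers) are
# available. At least a few samples per class are needed because the wrapper
# enables probability estimates by default.
def _example_svc_train():
    df = pd.DataFrame({
        'x1': [0.1, 0.2, 0.3, 0.4, 0.5, 2.1, 2.2, 2.3, 2.4, 2.5],
        'x2': [1.0, 1.1, 0.9, 1.2, 1.0, 0.1, 0.2, 0.0, 0.3, 0.1],
        'y': ['a'] * 5 + ['b'] * 5})
    result = _svc_train(df, feature_cols=['x1', 'x2'], label_col='y',
                        c=1.0, kernel='rbf')
    return result['model']['svc_model']  # the fitted sklearn.svm.SVC estimator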
def _hierarchical_clustering_post(table, model, num_clusters, cluster_col='prediction'):
    Z = model['model']
    input_cols = model['input_cols']
    params = model['parameters']
    out_table = model['outtable']
    predict = fcluster(Z, t=num_clusters, criterion='maxclust')
    out_table2 = table.copy()
    out_table2[cluster_col] = predict

    L, M = leaders(Z, predict)
    which_cluster = []
    for leader in L:
        if leader in Z[:, 0]:
            select_indices = np.where(Z[:, 0] == leader)[0][0]
            which_cluster.append(out_table['joined_column1'][select_indices])
        elif leader in Z[:, 1]:
            select_indices = np.where(Z[:, 1] == leader)[0][0]
            which_cluster.append(out_table['joined_column2'][select_indices])

    out_table3 = pd.DataFrame([])
    out_table3[cluster_col] = M
    out_table3['name_of_clusters'] = which_cluster
    out_table3 = out_table3.sort_values(cluster_col)
    cluster_count = np.bincount(out_table2[cluster_col])
    cluster_count = cluster_count[cluster_count != 0]
    # data = {'cluster_name': ['prediction' + str(i) for i in range(1, num_clusters + 1)]}
    out_table3['num_of_entities'] = list(cluster_count)

    rb = ReportBuilder()
    rb.addMD(strip_margin("""### Hierarchical Clustering Post Process Result"""))
    rb.addMD(strip_margin("""
    |### Parameters
    |
    |{display_params}
    |
    |## Clusters Information
    |
    |{out_table3}
    |
    """.format(display_params=dict2MD(params), out_table3=pandasDF2MD(out_table3))))

    model = _model_dict('hierarchical_clustering_post')
    model['report'] = rb.get()

    return {'out_table2': out_table2, 'model': model}
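
# Usage sketch (hypothetical helper): run _hierarchical_clustering first and feed
# its model into _hierarchical_clustering_post to cut the dendrogram into a fixed
# number of clusters. Assumes pandas/numpy and the scipy linkage helpers imported
# above are available.
def _example_hierarchical_post():
    df = pd.DataFrame({'a': [1.0, 1.1, 5.0, 5.2, 9.0],
                       'b': [0.9, 1.0, 5.1, 4.9, 9.2]})
    clustering = _hierarchical_clustering(df, input_cols=['a', 'b'])
    res = _hierarchical_clustering_post(df, clustering['model'], num_clusters=2)
    return res['out_table2']  # input rows plus a 'prediction' cluster column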
def _outlier_detection_lof(table, input_cols, choice='add_prediction',
                           n_neighbors=20, new_column_name='is_outlier'):
    # algorithm='auto', leaf_size=30, metric='minkowski', p=2, contamination=0.1
    out_table = table.copy()
    lof_model = LocalOutlierFactor(n_neighbors, algorithm='auto', leaf_size=30,
                                   metric='minkowski', p=2, contamination=0.1)
    # fit_predict returns 1 for inliers and -1 for outliers; run it once and
    # reuse the result instead of fitting the model a second time.
    lof_predictions = lof_model.fit_predict(out_table[input_cols])
    isinlier = lambda x: 'in' if x == 1 else 'out'
    out_table[new_column_name] = [isinlier(p) for p in lof_predictions]

    if choice == 'add_prediction':
        pass
    elif choice == 'remove_outliers':
        out_table = out_table[out_table[new_column_name] == 'in']
        out_table = out_table.drop(new_column_name, axis=1)
    elif choice == 'both':
        out_table = out_table[out_table[new_column_name] == 'in']

    params = {
        'Input Columns': input_cols,
        'Result Type': choice,
        'Number of Neighbors': n_neighbors,
        # 'Algorithm': algorithm,
        # 'Metric': metric,
        # 'Contamination': contamination
    }

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Outlier Detection (Local Outlier Factor) Result
    | ### Parameters
    |
    | {display_params}
    """.format(display_params=dict2MD(params))))

    model = _model_dict('outlier_detection_lof')
    model['params'] = params
    model['lof_model'] = lof_model
    model['report'] = rb.get()

    return {'out_table': out_table, 'model': model}
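
# Usage sketch (hypothetical helper): flag LOF outliers in a toy table.
# Assumes pandas is imported as `pd`; n_neighbors must stay below the row count.
def _example_outlier_detection_lof():
    df = pd.DataFrame({'x': [1.0, 1.1, 0.9, 1.2, 1.0, 0.8, 1.1, 9.0],
                       'y': [2.0, 2.1, 1.9, 2.2, 2.0, 1.8, 2.1, 9.0]})
    res = _outlier_detection_lof(df, input_cols=['x', 'y'],
                                 choice='add_prediction', n_neighbors=3)
    return res['out_table']  # original rows plus an 'is_outlier' in/out column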
def _pca(table, input_cols, new_column_name='projected_', n_components=None,
         copy=True, whiten=False, svd_solver='auto', tol=0.0,
         iterated_power='auto', random_state=None, hue=None, alpha=0,
         key_col=None):
    num_feature_cols = len(input_cols)
    if n_components is None:
        n_components = num_feature_cols

    # Fit with all components; only the first n_components are kept below.
    pca = PCA(None, copy, whiten, svd_solver, tol, iterated_power, random_state)
    pca_model = pca.fit(table[input_cols])

    column_names = []
    for i in range(0, n_components):
        column_names.append(new_column_name + str(i))
    # print(column_names)

    pca_result = pca_model.transform(table[input_cols])
    out_df = pd.DataFrame(data=pca_result[:, :n_components], columns=column_names)
    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names

    res_components = pca_model.components_
    res_components_df = pd.DataFrame(data=res_components[:n_components],
                                     columns=input_cols)
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_

    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # visualization
    plt.figure()
    if n_components == 1:
        sns.scatterplot(column_names[0], column_names[0], hue=hue, data=out_df)
        plt_two = plt2MD(plt)
        plt.clf()
    else:
        plt_two = _biplot(0, 1,
                          pc_columns=column_names,
                          columns=input_cols,
                          singular_values=res_singular_values,
                          components=res_components,
                          explained_variance_ratio=res_explained_variance_ratio,
                          alpha=alpha,
                          hue=hue,
                          data=out_df,
                          ax=plt.gca(),
                          key_col=key_col)

    plt.figure()
    fig_scree = _screeplot(res_explained_variance, res_explained_variance_ratio,
                           n_components)

    table_explained_variance = pd.DataFrame(res_explained_variance,
                                            columns=['explained_variance'])
    table_explained_variance['explained_variance_ratio'] = res_explained_variance_ratio
    table_explained_variance['cum_explained_variance_ratio'] = res_explained_variance_ratio.cumsum()

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## PCA Result
    | ### Plot
    | {image1}
    |
    | ### Explained Variance
    | {fig_scree}
    | {table_explained_variance}
    |
    | ### Components
    | {table2}
    |
    | ### Parameters
    | {parameter1}
    """.format(image1=plt_two,
               fig_scree=fig_scree,
               table_explained_variance=pandasDF2MD(table_explained_variance),
               parameter1=dict2MD(res_get_param),
               table2=pandasDF2MD(res_components_df))))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['report'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols

    return {'out_table': out_df, 'model': model}
def _pca(table, input_cols, new_column_name='projected_', n_components=None,
         copy=True, whiten=False, svd_solver='auto', tol=0.0,
         iterated_power='auto', random_state=None):
    num_feature_cols = len(input_cols)
    if n_components is None:
        n_components = num_feature_cols

    pca = PCA(n_components, copy, whiten, svd_solver, tol, iterated_power,
              random_state)
    pca_model = pca.fit(table[input_cols])

    column_names = []
    for i in range(0, n_components):
        column_names.append(new_column_name + str(i))
    # print(column_names)

    pca_result = pca_model.transform(table[input_cols])
    out_df = pd.DataFrame(data=pca_result, columns=column_names)

    res_components = pca_model.components_
    res_components_df = pd.DataFrame(data=res_components, columns=input_cols)
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_

    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # visualization
    plt.figure()
    if res_n_components == 1:
        plt.scatter(pca_result[:, 0], pca_result[:, 0])
    else:
        plt.scatter(pca_result[:, 0], pca_result[:, 1])
    # plt.title('PCA result with two components')
    # plt.show()
    plt_two = plt2MD(plt)
    plt.clf()

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    |
    | ### Plot
    | The x-axis and y-axis of the following plot are projected0 and projected1, respectively.
    | {image1}
    |
    | ### Result
    | {table1}
    | only showing top 20 rows
    |
    | ### Parameters
    | {parameter1}
    |
    | ### Components
    | {table2}
    |
    | ### Mean
    | {array1}
    |
    | ### Explained Variance
    | {array2}
    |
    """.format(table1=pandasDF2MD(out_df, 20),
               image1=plt_two,
               parameter1=dict2MD(res_get_param),
               table2=pandasDF2MD(res_components_df),
               array1=res_mean,
               array2=res_explained_variance)))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['report'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols

    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names

    return {'out_table': out_df, 'model': model}
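
# Usage sketch (hypothetical helper) for the PCA wrapper above: project a toy
# table onto its first two principal components. Assumes pandas is imported as
# `pd` and sklearn's PCA plus the plotting helpers are available as above.
def _example_pca():
    df = pd.DataFrame({'x1': [2.5, 0.5, 2.2, 1.9, 3.1, 2.3],
                       'x2': [2.4, 0.7, 2.9, 2.2, 3.0, 2.7],
                       'x3': [0.1, 0.0, 0.2, 0.1, 0.3, 0.2]})
    res = _pca(df, input_cols=['x1', 'x2', 'x3'], n_components=2)
    # out_table carries the original columns plus 'projected_0' and 'projected_1'.
    return res['out_table'], res['model']['explained_variance_ratio']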
def paired_ttest(table, first_column, second_column, alternative,
                 hypothesized_difference=0, confidence_level=0.95):
    df = len(table) - 1
    diff_mean = abs(table[first_column] - table[second_column]).mean()
    std_dev = np.sqrt(
        ((diff_mean - abs(table[first_column] - table[second_column]))
         * (diff_mean - abs(table[first_column] - table[second_column]))).mean())
    ans = stats.ttest_rel(table[first_column],
                          table[second_column] + hypothesized_difference)
    t_value = ans[0]
    p_value_ul = ans[1]
    # One-sided p-values from the t distribution with df degrees of freedom.
    p_value_u = stats.t.sf(t_value, df)
    p_value_l = stats.t.cdf(t_value, df)
    left_u = diff_mean - std_dev * stats.t.isf((1 - confidence_level), df) / np.sqrt(df)
    right_u = np.Infinity
    left_l = -np.Infinity
    right_l = diff_mean + std_dev * stats.t.isf((1 - confidence_level), df) / np.sqrt(df)
    left_ul = diff_mean - std_dev * stats.t.isf((1 - confidence_level) / 2, df) / np.sqrt(df)
    right_ul = diff_mean + std_dev * stats.t.isf((1 - confidence_level) / 2, df) / np.sqrt(df)

    result_value_u = [{
        'data': first_column + " , " + second_column,
        'alternative_hypothesis': "true difference in means > " + str(hypothesized_difference),
        'statistics': "t statistics, t distribution with " + str(df) + " degrees of freedom under the null hypothesis",
        'estimates': t_value,
        'p_value': p_value_u,
        'confidence_level': confidence_level,
        'low_confidence_interval': left_u,
        'upper_confidence_interval': right_u
    }]
    result_value_l = [{
        'data': first_column + " , " + second_column,
        'alternative_hypothesis': "true difference in means < " + str(hypothesized_difference),
        'statistics': "t statistics, t distribution with " + str(df) + " degrees of freedom under the null hypothesis",
        'estimates': t_value,
        'p_value': p_value_l,
        'confidence_level': confidence_level,
        'low_confidence_interval': left_l,
        'upper_confidence_interval': right_l
    }]
    result_value_ul = [{
        'data': first_column + " , " + second_column,
        'alternative_hypothesis': "true difference in means != " + str(hypothesized_difference),
        'statistics': "t statistics, t distribution with " + str(df) + " degrees of freedom under the null hypothesis",
        'estimates': t_value,
        'p_value': p_value_ul,
        'confidence_level': confidence_level,
        'low_confidence_interval': left_ul,
        'upper_confidence_interval': right_ul
    }]

    result_columns = ['data', 'alternative_hypothesis', 'statistics', 'estimates',
                      'p_value', 'confidence_level', 'low_confidence_interval',
                      'upper_confidence_interval']
    df_result = pd.DataFrame()
    df_u = pd.DataFrame(result_value_u, columns=result_columns)
    df_l = pd.DataFrame(result_value_l, columns=result_columns)
    df_ul = pd.DataFrame(result_value_ul, columns=result_columns)

    if 'greater' in alternative:
        df_result = df_result.append(df_u, ignore_index=True)
    if 'less' in alternative:
        df_result = df_result.append(df_l, ignore_index=True)
    if 'twosided' in alternative:
        df_result = df_result.append(df_ul, ignore_index=True)

    params = {
        'Input columns': first_column + ", " + second_column,
        'Hypothesized difference': str(hypothesized_difference),
        'Confidence level': str(confidence_level)
    }

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Paired T Test Result
    |
    |df|mean_difference|standard_deviation|t_value
    |--|--|--|--
    |{deg_f}|{dm}|{sd}|{tv}
    """.format(deg_f=df, dm=diff_mean, sd=std_dev, tv=t_value,
               params=dict2MD(params))))

    if 'greater' in alternative:
        rb.addMD(strip_margin("""
        | - H0 : true difference in means is less than or equal to {hd}.
        | - H1 : true difference in means is larger than {hd}.
        |
        |p_value|confidence_level|confidence_interval
        |--|--|--
        |{pvu}|{con_lv}|({l_u}, {r_u})
        |
        """.format(pvu=p_value_u, hd=str(hypothesized_difference),
                   con_lv=str(confidence_level), l_u=left_u, r_u=right_u)))
    if 'less' in alternative:
        rb.addMD(strip_margin("""
        | - H0 : true difference in means is larger than or equal to {hd}.
        | - H1 : true difference in means is less than {hd}.
        |
        |p_value|confidence_level|confidence_interval
        |--|--|--
        |{pvl}|{con_lv}|({l_l}, {r_l})
        |
        """.format(pvl=p_value_l, hd=str(hypothesized_difference),
                   con_lv=str(confidence_level), l_l=left_l, r_l=right_l)))
    if 'twosided' in alternative:
        rb.addMD(strip_margin("""
        | - H0 : true difference in means is equal to {hd}.
        | - H1 : true difference in means is not equal to {hd}.
        |
        |p_value|confidence_level|confidence_interval
        |--|--|--
        |{pvul}|{con_lv}|({l_ul}, {r_ul})
        |
        """.format(pvul=p_value_ul, hd=str(hypothesized_difference),
                   con_lv=str(confidence_level), l_ul=left_ul, r_ul=right_ul)))

    model = dict()
    model['report'] = rb.get()

    return {'out_table': df_result, 'model': model}
def _decision_tree_classification_train(table, feature_cols, label_col,
                                        # fig_size=np.array([6.4, 4.8]),
                                        criterion='gini', splitter='best',
                                        max_depth=None, min_samples_split=2,
                                        min_samples_leaf=1,
                                        min_weight_fraction_leaf=0.0,
                                        max_features=None, random_state=None,
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        class_weight=None, presort=False,
                                        sample_weight=None, check_input=True,
                                        X_idx_sorted=None):
    classifier = DecisionTreeClassifier(criterion, splitter, max_depth,
                                        min_samples_split, min_samples_leaf,
                                        min_weight_fraction_leaf, max_features,
                                        random_state, max_leaf_nodes,
                                        min_impurity_decrease, min_impurity_split,
                                        class_weight, presort)
    classifier.fit(table[feature_cols], table[label_col],
                   sample_weight, check_input, X_idx_sorted)

    try:
        from sklearn.externals.six import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(classifier, out_file=dot_data,
                        feature_names=feature_cols,
                        class_names=table[label_col].astype('str').unique(),
                        filled=True, rounded=True, special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        from brightics.common.report import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        fig_tree = ("Graphviz is needed to draw a Decision Tree graph. "
                    "Please download it from http://graphviz.org/download/ and install it to your computer.")

    # json
    model = _model_dict('decision_tree_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['classes'] = classifier.classes_
    feature_importance = classifier.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = classifier.max_features_
    model['n_classes'] = classifier.n_classes_
    model['n_features'] = classifier.n_features_
    model['n_outputs'] = classifier.n_outputs_
    model['tree'] = classifier.tree_
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier

    # report
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)), feature_importance[indices], color='b', align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v, i, " {:.2f}".format(v), color='b', va='center', fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.xlim(0, 1.1)
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)
    feature_importance_df = pd.DataFrame(data=feature_importance, index=feature_cols).T

    # Add tree plot
    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Decision Tree Classification Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))

    model['report'] = rb.get()

    return {'model': model}
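
# Usage sketch (hypothetical helper): train the decision tree wrapper on a toy
# table. If Graphviz/pydotplus are missing, the report simply falls back to an
# installation hint instead of the tree figure.
def _example_decision_tree_train():
    df = pd.DataFrame({'f1': [0, 0, 1, 1, 0, 1],
                       'f2': [1, 1, 0, 0, 1, 0],
                       'label': ['n', 'n', 'y', 'y', 'n', 'y']})
    res = _decision_tree_classification_train(df, feature_cols=['f1', 'f2'],
                                              label_col='label', max_depth=2)
    return res['model']['feature_importance']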
def _kmeans_train_predict(table, input_cols, n_clusters=3, prediction_col='prediction',
                          init='k-means++', n_init=10, max_iter=300, tol=1e-4,
                          precompute_distances='auto', seed=None, n_jobs=1,
                          algorithm='auto', n_samples=None):
    inputarr = table[input_cols]
    if n_samples is None:
        n_samples = len(inputarr)

    k_means = SKKMeans(n_clusters=n_clusters, init=init, n_init=n_init,
                       max_iter=max_iter, tol=tol,
                       precompute_distances=precompute_distances, verbose=0,
                       random_state=seed, copy_x=True, n_jobs=n_jobs,
                       algorithm=algorithm)
    k_means.fit(inputarr)

    params = {
        'input_cols': input_cols,
        'n_clusters': n_clusters,
        'init': init,
        'n_init': n_init,
        'max_iter': max_iter,
        'tol': tol,
        'precompute_distances': precompute_distances,
        'seed': seed,
        'n_jobs': n_jobs,
        'algorithm': algorithm
    }

    cluster_centers = k_means.cluster_centers_
    labels = k_means.labels_

    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    fig_centers = _kmeans_centers_plot(input_cols, cluster_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, cluster_centers)
    fig_pca = _kmeans_pca_plot(labels, cluster_centers, pca2_model, pca2)

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Result
    | - Number of iterations run: {n_iter_}.
    | - Coordinates of cluster centers
    | {fig_cluster_centers}
    | - Samples
    | {fig_pca}
    | {fig_samples}
    |
    | ### Parameters
    | {params}
    """.format(n_iter_=k_means.n_iter_, fig_cluster_centers=fig_centers,
               fig_pca=fig_pca, fig_samples=fig_samples, params=dict2MD(params))))

    model = _model_dict('kmeans')
    model['model'] = k_means
    model['input_cols'] = input_cols
    model['report'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = labels

    return {'out_table': out_table, 'model': model}
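
# Usage sketch (hypothetical helper): cluster a toy table into two groups.
# The PCA-based plots above need at least two input columns.
def _example_kmeans():
    df = pd.DataFrame({'x': [1.0, 1.2, 0.8, 8.0, 8.2, 7.9],
                       'y': [1.1, 0.9, 1.0, 8.1, 7.8, 8.0]})
    res = _kmeans_train_predict(df, input_cols=['x', 'y'], n_clusters=2, seed=42)
    return res['out_table']  # input rows plus a 'prediction' label column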
def naive_bayes_train(table, feature_cols, label_col, alpha=1.0, fit_prior=True,
                      class_prior=None):
    features = table[feature_cols]
    label = table[label_col]
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(label)
    label_correspond = label_encoder.transform(label)

    if class_prior is not None:
        tmp_class_prior = [0 for x in range(len(class_prior))]
        for elems in class_prior:
            tmp = elems.split(":")
            tmp_class_prior[label_encoder.transform([tmp[0]])[0]] = float(tmp[1])
        class_prior = tmp_class_prior

    nb_model = MultinomialNB(alpha, fit_prior, class_prior)
    nb_model.fit(features, label_correspond)
    prediction_correspond = nb_model.predict(features)

    get_param = dict()
    get_param['Lambda'] = alpha
    # get_param['Prior Probabilities of the Classes'] = class_prior
    # get_param['Fit Class Prior Probability'] = fit_prior
    get_param['Feature Columns'] = feature_cols
    get_param['Label Column'] = label_col

    cnf_matrix = confusion_matrix(label_correspond, prediction_correspond)

    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=label_encoder.classes_,
                          title='Confusion Matrix')
    fig_confusion_matrix = plt2MD(plt)
    accuracy = nb_model.score(features, label_correspond) * 100

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Naive Bayes Classification Result
    | ### Parameters
    | {table_parameter}
    | ### Predicted vs Actual
    | {image1}
    | #### Accuracy = {accuracy}%
    |
    """.format(image1=fig_confusion_matrix, accuracy=accuracy,
               table_parameter=dict2MD(get_param))))

    model = _model_dict('naive_bayes_model')
    model['features'] = feature_cols
    model['label_col'] = label_col
    model['label_encoder'] = label_encoder
    model['nb_model'] = nb_model
    model['report'] = rb.get()

    return {'model': model}
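
# Usage sketch (hypothetical helper). Note the class_prior format: a list of
# "label:prior" strings, which the code above splits on ':' and maps back onto
# the encoded label order. MultinomialNB expects non-negative feature values.
def _example_naive_bayes_train():
    df = pd.DataFrame({'w1': [3, 0, 1, 4, 0, 2],
                       'w2': [0, 2, 3, 0, 4, 1],
                       'label': ['spam', 'ham', 'ham', 'spam', 'ham', 'spam']})
    res = naive_bayes_train(df, feature_cols=['w1', 'w2'], label_col='label',
                            alpha=1.0, class_prior=['ham:0.6', 'spam:0.4'])
    return res['model']['nb_model']  # the fitted MultinomialNB estimator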
def _outlier_detection_tukey_carling(table, input_cols, outlier_method="tukey",
                                     multiplier=None, number_of_removal=1,
                                     choice='add_prediction',
                                     new_column_prefix='is_outlier_'):
    out_table = table.copy()

    if multiplier is None and outlier_method == "tukey":
        multiplier = 1.5
    elif multiplier is None and outlier_method == "carling":
        multiplier = 2.3

    mean = table.mean()
    q1s = table.quantile(0.25)
    q3s = table.quantile(0.75)
    iqrs = q3s - q1s

    new_column_names = ['{prefix}{col}'.format(prefix=new_column_prefix, col=col)
                        for col in input_cols]

    def _tukey(x, q1, q3, iqr, multiplier):
        return 'out' if x < q1 - multiplier * iqr or x > q3 + multiplier * iqr else 'in'

    def _carling(x, mean, iqr, multiplier):
        return 'out' if x < mean - multiplier * iqr or x > mean + multiplier * iqr else 'in'

    if outlier_method == "tukey":
        for col in input_cols:
            output_col_name = '{prefix}{col}'.format(prefix=new_column_prefix, col=col)
            out_table[output_col_name] = table[col].apply(
                lambda x: _tukey(x, q1s[col], q3s[col], iqrs[col], multiplier))
    elif outlier_method == "carling":
        for col in input_cols:
            output_col_name = '{prefix}{col}'.format(prefix=new_column_prefix, col=col)
            out_table[output_col_name] = table[col].apply(
                lambda x: _carling(x, mean[col], iqrs[col], multiplier))

    # Keep a row when fewer than `number_of_removal` of its columns are flagged 'out'.
    prediction = out_table[new_column_names].apply(
        lambda row: np.sum(row == 'out') < number_of_removal, axis=1)

    rb = ReportBuilder()
    params = {
        'Input Columns': input_cols,
        'Outlier Method': outlier_method,
        'Multiplier': multiplier,
        'Number of Outliers in a Row': number_of_removal,
        'Result Type': choice,
        'New Column Prefix': new_column_prefix
    }
    rb.addMD(strip_margin("""
    | ## Outlier Detection (Tukey/Carling) Result
    | ### Parameters
    |
    | {display_params}
    """.format(display_params=dict2MD(params))))

    if choice == 'add_prediction':
        pass
    elif choice == 'remove_outliers':
        out_table = out_table.drop(new_column_names, axis=1)
        out_table = out_table[prediction.values]
    elif choice == 'both':
        out_table = out_table[prediction.values]

    model = _model_dict('outlier_detection_tukey_carling')
    model['params'] = params
    model['mean'] = mean
    model['q1'] = q1s
    model['q3'] = q3s
    model['iqr'] = iqrs
    model['multiplier'] = multiplier
    model['report'] = rb.get()

    return {'out_table': out_table, 'model': model}
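
# Usage sketch (hypothetical helper): Tukey fences with the default 1.5
# multiplier. With number_of_removal=1, a row flagged 'out' in at least one
# inspected column is dropped when choice removes outliers.
def _example_outlier_detection_tukey():
    df = pd.DataFrame({'x': [1.0, 1.2, 0.9, 1.1, 1.0, 15.0],
                       'y': [2.0, 2.2, 1.9, 2.1, 2.0, 2.1]})
    res = _outlier_detection_tukey_carling(df, input_cols=['x', 'y'],
                                           outlier_method='tukey',
                                           choice='remove_outliers')
    return res['out_table']  # rows that survived the fence test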
def _xgb_classification_train(table, feature_cols, label_col, max_depth=3,
                              learning_rate=0.1, n_estimators=100, silent=True,
                              objective='binary:logistic', booster='gbtree',
                              n_jobs=1, nthread=None, gamma=0, min_child_weight=1,
                              max_delta_step=0, subsample=1, colsample_bytree=1,
                              colsample_bylevel=1, reg_alpha=0, reg_lambda=1,
                              scale_pos_weight=1, base_score=0.5, random_state=0,
                              seed=None, missing=None, sample_weight=None,
                              eval_set=None, eval_metric=None,
                              early_stopping_rounds=None, verbose=True,
                              xgb_model=None, sample_weight_eval_set=None):
    classifier = XGBClassifier(max_depth, learning_rate, n_estimators, silent,
                               objective, booster, n_jobs, nthread, gamma,
                               min_child_weight, max_delta_step, subsample,
                               colsample_bytree, colsample_bylevel, reg_alpha,
                               reg_lambda, scale_pos_weight, base_score,
                               random_state, seed, missing)
    classifier.fit(table[feature_cols], table[label_col],
                   sample_weight, eval_set, eval_metric, early_stopping_rounds,
                   verbose, xgb_model, sample_weight_eval_set)

    # json
    get_param = classifier.get_params()
    feature_importance = classifier.feature_importances_
    # plt.rcdefaults()
    plot_importance(classifier)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()
    # plt.rcParams['figure.dpi'] = figure_dpi
    # plot_tree(classifier)
    # fig_plot_tree_UT = plt2MD(plt)
    # plt.clf()
    # plt.rcParams['figure.dpi'] = figure_dpi
    # plot_tree(classifier, rankdir='LR')
    # fig_plot_tree_LR = plt2MD(plt)
    # plt.rcdefaults()
    # plt.clf()

    model = _model_dict('xgb_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['parameters'] = get_param
    model['feature_importance'] = feature_importance
    model['classifier'] = classifier

    # report
    # get_param_list = []
    # get_param_list.append(['feature_cols', feature_cols])
    # get_param_list.append(['label_col', label_col])
    params = dict2MD(get_param)
    # for key, value in get_param.items():
    #     temp = [key, value]
    #     get_param_list.append(temp)
    # get_param_df = pd.DataFrame(data=get_param_list, columns=['parameter', 'value'])
    feature_importance_df = pd.DataFrame(data=feature_importance, index=feature_cols).T

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## XGB Classification Train Result
    |
    | ### Plot Importance
    | {fig_importance}
    |
    | ### Feature Importance
    | {table_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               list_parameters=params)))

    model['report'] = rb.get()

    return {'model': model}
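
# Usage sketch (hypothetical helper): binary XGBoost classification on a toy
# table. Assumes the XGBClassifier/plot_importance imports used above resolve
# to the installed xgboost package.
def _example_xgb_train():
    df = pd.DataFrame({'f1': [0.1, 0.3, 0.2, 0.9, 0.8, 0.7],
                       'f2': [1.0, 0.8, 0.9, 0.2, 0.1, 0.3],
                       'label': [0, 0, 0, 1, 1, 1]})
    res = _xgb_classification_train(df, feature_cols=['f1', 'f2'],
                                    label_col='label', n_estimators=10)
    return res['model']['feature_importance']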
def _hierarchical_clustering(table, input_cols, link='complete', met='euclidean',
                             p=2, num_rows=20, figure_height=6.4, orient='right'):
    table = table.copy()
    df = table[input_cols]
    Z = linkage(df, method=link, metric=met)
    out_table = pd.DataFrame([])
    out_table['linkage_step'] = [x + 1 for x in reversed(range(len(Z)))]
    out_table['joined_column1'] = ['pt_' + str(int(Z[:, 0][i])) for i in range(len(Z))]
    out_table['joined_column2'] = ['pt_' + str(int(Z[:, 1][i])) for i in range(len(Z))]
    out_table['name_of_clusters'] = ['CL_' + str(i + 1) for i in reversed(range(len(Z)))]
    out_table['distance'] = [distance for distance in Z[:, 2]]
    out_table['number_of_original'] = [int(entities) for entities in Z[:, 3]]

    # switch name of point to cluster name (cast the linkage entry to int
    # before using it as an index, since Z holds floats)
    for i in range(len(Z)):
        if Z[:, 0][i] >= len(df):
            out_table['joined_column1'][i] = out_table['name_of_clusters'][int(Z[:, 0][i]) - len(df)]
        if Z[:, 1][i] >= len(df):
            out_table['joined_column2'][i] = out_table['name_of_clusters'][int(Z[:, 1][i]) - len(df)]

    out_table = out_table.reindex(index=out_table.index[::-1])
    out_table1 = out_table.head(num_rows)

    # calculate full dendrogram
    def _llf(id):
        n = len(df)
        if id < n:
            return 'pt_' + str(id)

    plt.figure(figsize=(8.4, figure_height))
    _fancy_dendrogram(
        Z,
        truncate_mode='none',  # show only the last p merged clusters (if another)
        get_leaves=True,
        orientation=orient,
        labels=True,
        leaf_label_func=_llf,
        leaf_rotation=45,
        leaf_font_size=5.,
        show_contracted=False,  # to get a distribution impression in truncated branches
        annotate_above=float(10),  # useful in small plots so annotations don't overlap
        # max_d=distance_threshold,  # will plot a horizontal cut-off line, max_d as in max_distance
    )
    plt.title('Hierarchical Clustering Dendrogram')
    if orient == 'top':
        plt.xlabel('Samples')
        plt.ylabel('Distance')
    elif orient == 'right':
        plt.xlabel('Distance')
        plt.ylabel('Samples')
    plt2 = plt2MD(plt)
    plt.clf()

    rb = ReportBuilder()
    params = {
        'Input Columns': input_cols,
        'Linkage Method': link,
        'Metric': met,
        'Number of Rows in Linkage Matrix': num_rows
    }
    rb.addMD(strip_margin("""### Hierarchical Clustering Result"""))
    rb.addMD(strip_margin("""
    |## Dendrogram
    |
    |{image}
    |
    |### Parameters
    |
    | {display_params}
    |
    |## Linkage Matrix
    |
    |{out_table1}
    |
    """.format(image=plt2, display_params=dict2MD(params),
               out_table1=pandasDF2MD(out_table1))))

    model = _model_dict('hierarchical_clustering')
    model['model'] = Z
    model['input_cols'] = input_cols
    model['parameters'] = params
    model['outtable'] = out_table
    model['report'] = rb.get()

    return {'model': model}
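
# Usage sketch (hypothetical helper): build the linkage and render a small
# dendrogram; the returned model feeds _hierarchical_clustering_post above.
def _example_hierarchical_clustering():
    df = pd.DataFrame({'a': [1.0, 1.1, 5.0, 5.2],
                       'b': [0.9, 1.0, 5.1, 4.9]})
    res = _hierarchical_clustering(df, input_cols=['a', 'b'],
                                   link='complete', met='euclidean')
    return res['model']['outtable']  # linkage matrix as a readable DataFrame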
def _naive_bayes_train(table, feature_cols, label_col, alpha=1.0, fit_prior=True,
                       class_prior=None):
    features = table[feature_cols]
    label = table[label_col]
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(label)
    label_correspond = label_encoder.transform(label)

    if class_prior is not None:
        tmp_class_prior = [0 for x in range(len(class_prior))]
        for elems in class_prior:
            tmp = elems.split(":")
            tmp_class_prior[label_encoder.transform([tmp[0]])[0]] = float(tmp[1])
        class_prior = tmp_class_prior

    nb_model = MultinomialNB(alpha, fit_prior, class_prior)
    nb_model.fit(features, label_correspond)
    class_log_prior = nb_model.class_log_prior_
    feature_log_prob_ = nb_model.feature_log_prob_
    tmp_result = np.hstack(
        (list(map(list, zip(*[label_encoder.classes_] + [class_log_prior]))),
         (feature_log_prob_)))
    column_names = ['labels', 'pi']
    for feature_col in feature_cols:
        column_names += ['theta_' + feature_col]
    result_table = pd.DataFrame.from_records(tmp_result, columns=column_names)
    prediction_correspond = nb_model.predict(features)

    get_param = dict()
    get_param['Lambda'] = alpha
    # get_param['Prior Probabilities of the Classes'] = class_prior
    get_param['Fit Class Prior Probability'] = fit_prior
    get_param['Feature Columns'] = feature_cols
    get_param['Label Column'] = label_col

    cnf_matrix = confusion_matrix(label_correspond, prediction_correspond)

    plt.figure()
    _plot_confusion_matrix(cnf_matrix, classes=label_encoder.classes_,
                           title='Confusion Matrix')
    fig_confusion_matrix = plt2MD(plt)
    accuracy = nb_model.score(features, label_correspond) * 100

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Naive Bayes Classification Result
    |
    | ### Model:Multinomial
    | {result_table}
    | ### Parameters
    | {table_parameter}
    | ### Predicted vs Actual
    | {image1}
    | #### Accuracy = {accuracy}%
    |
    """.format(image1=fig_confusion_matrix, accuracy=accuracy,
               result_table=pandasDF2MD(result_table),
               table_parameter=dict2MD(get_param))))

    model = _model_dict('naive_bayes_model')
    model['features'] = feature_cols
    model['label_col'] = label_col
    model['label_encoder'] = label_encoder
    model['nb_model'] = nb_model
    model['report'] = rb.get()

    return {'model': model}
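
# Usage sketch (hypothetical helper) for the variant above, which additionally
# reports the per-class log priors (pi) and per-feature log likelihoods
# (theta_*) in the result table.
def _example_naive_bayes_model_table():
    df = pd.DataFrame({'w1': [3, 0, 1, 4, 0, 2],
                       'w2': [0, 2, 3, 0, 4, 1],
                       'label': ['spam', 'ham', 'ham', 'spam', 'ham', 'spam']})
    res = _naive_bayes_train(df, feature_cols=['w1', 'w2'], label_col='label')
    return res['model']['nb_model'].class_log_prior_  # log prior per class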