def agglomerative_clustering_train_predict(input_table, input_cols, n_clusters=3, affinity='euclidean',
                                           compute_full_tree=True, linkage='ward',
                                           prediction_col='prediction', figw=6.4, figh=4.8):
    """Fit agglomerative clustering on ``input_cols`` and label every row.

    Args:
        input_table: DataFrame holding the samples.
        input_cols: Columns used as clustering features.
        n_clusters, affinity, compute_full_tree, linkage: Forwarded unchanged
            to ``SKAgglomerativeClustering``.
        prediction_col: Name of the column that receives the cluster labels.
        figw, figh: Width and height of the dendrogram figure.

    Returns:
        dict with ``'out_table'`` (a copy of ``input_table`` plus the label
        column) and ``'agglomerative_result'`` (fitted model, input columns
        and the rendered report).
    """
    inputarr = input_table[input_cols]

    agglomerative_clustering = SKAgglomerativeClustering(
        n_clusters=n_clusters, affinity=affinity, memory=None, connectivity=None,
        compute_full_tree=compute_full_tree, linkage=linkage)
    agglomerative_clustering.fit(inputarr)

    # Label a copy so the caller's table is not modified in place.
    out_table = input_table.copy()
    out_table[prediction_col] = agglomerative_clustering.labels_

    children = agglomerative_clustering.children_
    # The merge index is used as the drawing height and the observation
    # counts are simple placeholders; neither is a measured quantity.
    distance = np.arange(children.shape[0])
    no_of_observations = np.arange(2, children.shape[0] + 2)
    linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float)

    plt.figure(figsize=(figw, figh))
    dendrogram(linkage_matrix)
    plot_dendrogram = plt2MD(plt)
    # Close (not just clear) the figure so it is neither leaked nor left as
    # the current figure, with its custom size, for later plotting code.
    plt.close()

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Agglomerative Clustering Result
    | {plot_dendrogram}
    """.format(plot_dendrogram=plot_dendrogram)))

    agglomerative_clustering_result = {
        'model': agglomerative_clustering,
        'input_cols': input_cols,
        'report': rb.get()
    }

    return {'out_table': out_table, 'agglomerative_result': agglomerative_clustering_result}
def kmeans_train_predict(table, input_cols, n_clusters=3, prediction_col='prediction', init='k-means++',
                         n_init=10, max_iter=300, tol=1e-4, precompute_distances='auto', seed=None,
                         n_jobs=1, algorithm='auto', n_samples=None):
    """Train k-means on ``input_cols`` and append the cluster label of each row.

    Returns a dict with ``'out_table'`` (copy of ``table`` plus
    ``prediction_col``) and ``'model'`` (fitted estimator, input columns and
    a report with the centre, sample and PCA plots).
    """
    features = table[input_cols]
    sample_count = len(features) if n_samples is None else n_samples

    estimator = SKKMeans(n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter, tol=tol,
                         precompute_distances=precompute_distances, verbose=0, random_state=seed,
                         copy_x=True, n_jobs=n_jobs, algorithm=algorithm)
    estimator.fit(features)

    # Insertion order is kept because it drives the order of the rendered parameter list.
    params = {
        'input_cols': input_cols,
        'n_clusters': n_clusters,
        'init': init,
        'n_init': n_init,
        'max_iter': max_iter,
        'tol': tol,
        'precompute_distances': precompute_distances,
        'seed': seed,
        'n_jobs': n_jobs,
        'algorithm': algorithm,
    }

    centers = estimator.cluster_centers_
    assigned = estimator.labels_

    # Two-component projection used only for the scatter plot in the report.
    projector = PCA(n_components=2).fit(features)
    projected = projector.transform(features)

    centers_figure = _kmeans_centers_plot(input_cols, centers)
    samples_figure = _kmeans_samples_plot(table, input_cols, sample_count, centers)
    pca_figure = _kmeans_pca_plot(assigned, centers, projector, projected)

    report = ReportBuilder()
    report.addMD(strip_margin("""
    | ## Kmeans Result
    | - Number of iterations run: {n_iter_}.
    | - Coordinates of cluster centers
    | {fig_cluster_centers}
    | - Samples
    | {fig_pca}
    | {fig_samples}
    |
    | ### Parameters
    | {params}
    """.format(n_iter_=estimator.n_iter_, fig_cluster_centers=centers_figure, fig_pca=pca_figure,
               fig_samples=samples_figure, params=dict2MD(params))))

    fitted = _model_dict('kmeans')
    fitted['model'] = estimator
    fitted['input_cols'] = input_cols
    fitted['report'] = report.get()

    labeled = table.copy()
    labeled[prediction_col] = assigned

    return {'out_table': labeled, 'model': fitted}
def _evaluate_regression(table, label_col, prediction_col):
    """Score regression predictions against their labels.

    Computes explained variance, MAE, MSE, median absolute error and R2 for
    ``table[prediction_col]`` versus ``table[label_col]`` and returns
    ``{'result': summary}``, where ``summary`` holds the individual scores, a
    one-row DataFrame under ``'all'`` and the rendered report.
    """
    actual = table[label_col]
    predicted = table[prediction_col]

    # compute metrics
    evs = explained_variance_score(actual, predicted)
    mae = mean_absolute_error(actual, predicted)
    mse = mean_squared_error(actual, predicted)
    mdae = median_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)

    metric_names = [
        'r2_score', 'mean_squared_error', 'mean_absolute_error',
        'median_absolute_error', 'explained_variance_score'
    ]
    metric_values = dict(zip(metric_names, [r2, mse, mae, mdae, evs]))

    # json
    summary = {'label_col': label_col, 'prediction_col': prediction_col}
    for metric_name in metric_names:
        summary[metric_name] = metric_values[metric_name]

    # report
    all_df = pd.DataFrame([metric_values], columns=metric_names)
    summary['all'] = all_df

    report = ReportBuilder()
    report.addMD(strip_margin("""
    | ## Evaluate Regression Result
    | ### Metrics
    | {table1}
    |
    |
    """.format(table1=pandasDF2MD(all_df))))
    summary['report'] = report.get()

    return {'result': summary}
def _svm_classification_train(table, feature_cols, label_col, c=1.0, kernel='rbf', degree=3, gamma='auto',
                              coef0=0.0, shrinking=True, probability=True, tol=1e-3, max_iter=-1,
                              random_state=None):
    """Train an ``svm.SVC`` classifier on ``feature_cols`` against ``label_col``.

    Returns ``{'model': model}`` where the model dict carries the fitted
    classifier under ``'svc_model'``, the feature column names and a report
    listing the estimator parameters.
    """
    working = table.copy()
    features = working[feature_cols]
    labels = working[label_col]

    classifier = svm.SVC(C=c, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0,
                         shrinking=shrinking, probability=probability, tol=tol,
                         max_iter=max_iter, random_state=random_state)
    fitted = classifier.fit(features, labels)

    # Estimator parameters plus the column roles, shown in the report.
    shown_params = classifier.get_params()
    shown_params['feature_cols'] = feature_cols
    shown_params['label_col'] = label_col

    report = ReportBuilder()
    report.addMD(strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter}
    """.format(table_parameter=dict2MD(shown_params))))

    result = _model_dict('svc_model')
    result['svc_model'] = fitted
    result['features'] = feature_cols
    result['report'] = report.get()

    return {'model': result}
def tukeys_range_test(table, response_cols, factor_col, alpha=0.05):
    """Run Tukey's range test for each response column grouped by ``factor_col``.

    Args:
        table: DataFrame holding the responses and the factor.
        response_cols: Columns tested one after another.
        factor_col: Column whose values define the groups.
        alpha: Significance level forwarded to ``pairwise_tukeyhsd``.

    Returns:
        ``{'result': {'report': ...}}`` with one section (result table and
        simultaneous-interval plot) per response column.
    """
    rb = ReportBuilder()
    rb.addMD("""## Tukey's range test Result""")

    groups = table[factor_col]
    for response_col in response_cols:
        posthoc = pairwise_tukeyhsd(table[response_col], groups, alpha=alpha)
        posthoc_html = posthoc._results_table.as_html()
        posthoc.plot_simultaneous()

        rb.addMD("""### {response_col}""".format(response_col=response_col))
        rb.addHTML(posthoc_html)
        rb.addPlt(plt)
        # Close the figure instead of clearing it, so one figure per
        # response column does not stay open after it has been captured.
        plt.close()

    return {'result': {'report': rb.get()}}
def _hierarchical_clustering_post(table, model, num_clusters, cluster_col='prediction'):
    """Cut a fitted hierarchical clustering into ``num_clusters`` flat clusters.

    ``model`` supplies the linkage matrix (``'model'``), the training
    parameters (``'parameters'``) and the merge table (``'outtable'``).
    Returns ``{'out_table2': labelled copy of table, 'model': report model}``.
    """
    linkage_matrix = model['model']
    # Unused below; the lookup is kept so a model without this key still raises KeyError.
    input_cols = model['input_cols']
    train_params = model['parameters']
    merge_table = model['outtable']

    flat_labels = fcluster(linkage_matrix, t=num_clusters, criterion='maxclust')
    labeled_table = table.copy()
    labeled_table[cluster_col] = flat_labels

    leader_nodes, leader_labels = leaders(linkage_matrix, flat_labels)

    # Resolve each leader node to the name recorded for the merge step where it
    # first appears as the left child, else where it first appears as the right child.
    left_children = linkage_matrix[:, 0]
    right_children = linkage_matrix[:, 1]
    cluster_names = []
    for node in leader_nodes:
        left_hits = np.where(left_children == node)[0]
        if left_hits.size:
            cluster_names.append(merge_table['joined_column1'][left_hits[0]])
            continue
        right_hits = np.where(right_children == node)[0]
        if right_hits.size:
            cluster_names.append(merge_table['joined_column2'][right_hits[0]])

    cluster_info = pd.DataFrame([])
    cluster_info[cluster_col] = leader_labels
    cluster_info['name_of_clusters'] = cluster_names
    cluster_info = cluster_info.sort_values(cluster_col)

    member_counts = np.bincount(labeled_table[cluster_col])
    cluster_info['num_of_entities'] = list(member_counts[member_counts != 0])

    report = ReportBuilder()
    report.addMD(strip_margin("""### Hierarchical Clustering Post Process Result"""))
    report.addMD(strip_margin("""
    |### Parameters
    |
    |{display_params}
    |
    |## Clusters Information
    |
    |{out_table3}
    |
    """.format(display_params=dict2MD(train_params), out_table3=pandasDF2MD(cluster_info))))

    post_model = _model_dict('hierarchical_clustering_post')
    post_model['report'] = report.get()

    return {'out_table2': labeled_table, 'model': post_model}
def _profile_table(table, bins=10, check_correlation=False, correlation_threshold=0.9,
                   correlation_overrides=None):
    """Build a ``pd_profiling.ProfileReport`` for ``table`` and wrap its HTML in a report.

    All keyword arguments are forwarded unchanged to ``ProfileReport``.
    Returns ``{'result': {'report': ...}}``.
    """
    report = ReportBuilder()

    profile_options = dict(
        bins=bins,
        check_correlation=check_correlation,
        correlation_threshold=correlation_threshold,
        correlation_overrides=correlation_overrides,
    )
    profile = pd_profiling.ProfileReport(table, **profile_options)
    report.addHTML(profile.html)

    return {'result': {'report': report.get()}}
def _plot_roc_pr_curve(table, label_col, probability_col, fig_size=[6.4, 4.8], pos_label=None):
    """Render ROC, precision-recall and confusion-matrix plots for a binary score column.

    The plots come from ``_plot_binary``; the returned ``{'result': summary}``
    carries its threshold, the two column names and the report.
    """
    width, height = fig_size[0], fig_size[1]
    plotted = _plot_binary(table[label_col], table[probability_col],
                           fig_size=(width, height), pos_label=pos_label)
    threshold, fig_tpr_fpr, fig_roc, fig_precision_recall, fig_pr, fig_confusion = plotted

    summary = {
        'threshold': threshold,
        'label_col': label_col,
        'probability_col': probability_col,
    }

    figures = {
        'fig_roc': fig_roc,
        'fig_tpr_fpr': fig_tpr_fpr,
        'fig_pr': fig_pr,
        'fig_precision_recall': fig_precision_recall,
        'fig_confusion': fig_confusion,
    }
    report = ReportBuilder()
    report.addMD(strip_margin("""
    | ## Plot ROC Curve and PR Curve Result
    |
    | ### ROC Curve
    | {fig_tpr_fpr}
    | {fig_roc}
    |
    | ### PR Curve
    | {fig_precision_recall}
    | {fig_pr}
    |
    | ### Confusion Matrix
    | {fig_confusion}
    """.format(**figures)))
    summary['report'] = report.get()

    return {'result': summary}
def _function_by_group(function, table, model=None, group_by=None, **params): if isinstance(table, pd.DataFrame) and group_by is not None: table, group_keys = _group(table, group_by) else: group_keys = table['_grouped_data'].keys() sample_group = group_keys[0] if model is None: sample_result = function(table=table['_grouped_data'][sample_group], **params) else: sample_result = function(table=table['_grouped_data'][sample_group], model=model['_grouped_data'][sample_group], **params) res_keys = sample_result.keys() df_keys = [k for k, v in sample_result.items() if isinstance(v, pd.DataFrame)] model_keys_containing_repr = [k for k, v in sample_result.items() if isinstance(v, dict) and 'report' in v] res_dict = dict() for res_key in res_keys: res_dict[res_key] = {'_grouped_data':dict()} for group in group_keys: if model is None: res_group = function(table=table['_grouped_data'][group], **params) else: res_group = function(table=table['_grouped_data'][group], model=model['_grouped_data'][group], **params) for res_key in res_keys: res_dict[res_key]['_grouped_data'][group] = res_group[res_key] for repr_key in model_keys_containing_repr: rb = ReportBuilder() for group in group_keys: rb.addMD('{group}'.format(group=group)) rb.merge(res_dict[repr_key]['_grouped_data'][group]['report']) res_dict[repr_key]['report'] = rb.get() for df_key in df_keys: res_dict[df_key] = _flatten(res_dict[df_key]) return res_dict
def _outlier_detection_lof(table, input_cols, choice='add_prediction', n_neighbors=20,
                           new_column_name='is_outlier'):
    """Flag outliers with ``LocalOutlierFactor`` and optionally drop them.

    Args:
        table: Input DataFrame; it is copied, never modified.
        input_cols: Feature columns handed to the detector.
        choice: ``'add_prediction'`` keeps every row and adds the flag column,
            ``'remove_outliers'`` keeps only inliers and drops the flag column,
            ``'both'`` keeps only inliers together with the flag column. Any
            other value behaves like ``'add_prediction'``.
        n_neighbors: Number of neighbours used by the detector.
        new_column_name: Name of the ``'in'`` / ``'out'`` flag column.

    Returns:
        dict with ``'out_table'`` and ``'model'`` (parameters, fitted detector
        and report).
    """
    out_table = table.copy()
    lof_model = LocalOutlierFactor(n_neighbors=n_neighbors, algorithm='auto', leaf_size=30,
                                   metric='minkowski', p=2, contamination=0.1)

    # Fit once and reuse the labels; a prediction of 1 is mapped to 'in'.
    lof_labels = lof_model.fit_predict(out_table[input_cols])
    out_table[new_column_name] = ['in' if label == 1 else 'out' for label in lof_labels]

    if choice in ('remove_outliers', 'both'):
        out_table = out_table[out_table[new_column_name] == 'in']
        if choice == 'remove_outliers':
            out_table = out_table.drop(new_column_name, axis=1)

    params = {
        'Input Columns': input_cols,
        'Result Type': choice,
        'Number of Neighbors': n_neighbors,
    }

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Outlier Detection (Local Outlier Factor) Result
    | ### Parameters
    |
    | {display_params}
    """.format(display_params=dict2MD(params))))

    model = _model_dict('outlier_detection_lof')
    model['params'] = params
    model['lof_model'] = lof_model
    model['report'] = rb.get()

    return {'out_table': out_table, 'model': model}
def bartletts_test(table, response_cols, factor_col):
    """Run Bartlett's test for equal variances on each response column.

    Args:
        table: DataFrame holding the responses and the factor.
        response_cols: Columns tested one after another.
        factor_col: Column whose unique values define the groups.

    Returns:
        ``{'result': result}`` where ``result['result_table']`` has one row per
        response column (``data``, ``estimate``, ``p_value``) and
        ``result['report']`` is the rendered report.
    """
    factor = table[factor_col]
    groups = factor.unique()

    data_list = []
    stat_list = []
    p_list = []
    for response_col in response_cols:
        response = table[response_col]
        stat_bart, p_bart = bartlett(*[response[factor == group] for group in groups])
        data_list.append('{response_col} by {factor_col}'.format(
            response_col=response_col, factor_col=factor_col))
        stat_list.append(stat_bart)
        p_list.append(p_bart)

    # DataFrame.from_items no longer exists in pandas; `columns` pins the order.
    result_table = pd.DataFrame(
        {'data': data_list, 'estimate': stat_list, 'p_value': p_list},
        columns=['data', 'estimate', 'p_value'])

    result = dict()
    result['result_table'] = result_table

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Bartlett's Test Result
    | - H0: k population variances are equal.
    | - H1: at least two variances are different.
    |
    | {result_table}
    """.format(result_table=pandasDF2MD(result_table))))
    result['report'] = rb.get()

    return {'result': result}
def tfidf_train(table, tokens_col, tf_weighing='n', df_weighing='t', document_normalization='c'):
    """Fit a TF-IDF model on the token lists in ``tokens_col``.

    The three weighting letters are concatenated into the ``smartirs`` code
    given to ``TfidfModel``. Returns ``'out_table'`` (copy of ``table`` plus a
    ``'sparse_vectors'`` column) and ``'fit_model'`` (dictionary, model and a
    report listing the dictionary).
    """
    result_table = table.copy()
    documents = result_table[tokens_col]
    smartirs_code = tf_weighing + df_weighing + document_normalization

    vocabulary = Dictionary(documents)
    bow_corpus = [vocabulary.doc2bow(tokens) for tokens in documents]
    tfidf_model = TfidfModel(bow_corpus, smartirs=smartirs_code)
    weighted_corpus = list(tfidf_model[bow_corpus])

    # corpus2csc output is transposed before encoding; assumes it is
    # terms x documents, so rows become documents - TODO confirm.
    doc_term_matrix = corpus2csc(weighted_corpus, num_terms=len(vocabulary.token2id)).T

    words = list(vocabulary.values())
    indices = list(vocabulary.keys())
    vocabulary_frame = pd.DataFrame({'indice': indices, 'word': words})

    report = ReportBuilder()
    report.addMD(strip_margin("""
    | ## Dictionary
    | {table1}
    """.format(table1=pandasDF2MD(vocabulary_frame))))

    result_table['sparse_vectors'] = sparse_encode(doc_term_matrix)['sparse_vectors']

    fitted = {
        'dictionary': vocabulary,
        'model': tfidf_model,
        'report': report.get(),
    }

    return {'out_table': result_table, 'fit_model': fitted}
def _pca(table, input_cols, new_column_name='projected_', n_components=None, copy=True, whiten=False,
         svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None):
    """Project ``input_cols`` onto principal components and report the fit.

    Args:
        table: Input DataFrame.
        input_cols: Feature columns to decompose.
        new_column_name: Prefix of the projected columns; the component index
            is appended to it.
        n_components: Forwarded to ``PCA``; defaults to ``len(input_cols)``.
        copy, whiten, svd_solver, tol, iterated_power, random_state:
            Forwarded to ``PCA`` by keyword.

    Returns:
        dict with ``'out_table'`` (``table`` with a reset index plus the
        projected columns) and ``'model'`` (fitted attributes, the estimator
        and the report).
    """
    if n_components is None:
        n_components = len(input_cols)

    # Keyword arguments: current scikit-learn accepts only n_components positionally.
    pca = PCA(n_components=n_components, copy=copy, whiten=whiten, svd_solver=svd_solver,
              tol=tol, iterated_power=iterated_power, random_state=random_state)
    # The features are selected afresh for fit and transform so that an
    # in-place fit (copy=False) cannot alter the data that is projected.
    pca_model = pca.fit(table[input_cols])
    pca_result = pca_model.transform(table[input_cols])

    res_components = pca_model.components_
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_
    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # Name columns after the fitted component count so that non-integer
    # n_components settings (a variance ratio or 'mle') are handled as well.
    column_names = [new_column_name + str(i) for i in range(res_n_components)]

    # Flat column lists; wrapping them in another list would create MultiIndex headers.
    out_df = pd.DataFrame(data=pca_result, columns=column_names)
    res_components_df = pd.DataFrame(data=res_components, columns=list(input_cols))

    # visualization
    plt.figure()
    if res_n_components == 1:
        plt.scatter(pca_result[:, 0], pca_result[:, 0])
    else:
        plt.scatter(pca_result[:, 0], pca_result[:, 1])
    plt_two = plt2MD(plt)
    # Close the figure opened above instead of leaving it open and current.
    plt.close()

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    |
    | ### Plot
    | The x-axis and y-axis of the following plot is projected0 and projected1, respectively.
    | {image1}
    |
    | ### Result
    | {table1}
    | only showing top 20 rows
    |
    | ### Parameters
    | {parameter1}
    |
    | ### Components
    | {table2}
    |
    | ### Mean
    | {array1}
    |
    | ### Explained Variance
    | {array2}
    |
    """.format(table1=pandasDF2MD(out_df, 20), image1=plt_two, parameter1=dict2MD(res_get_param),
               table2=pandasDF2MD(res_components_df), array1=res_mean,
               array2=res_explained_variance)))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['report'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols

    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names

    return {'out_table': out_df, 'model': model}
def paired_ttest(table, first_column, second_column, alternative, hypothesized_difference=0,
                 confidence_level=0.95):
    """Run a paired t-test between two columns of ``table``.

    Args:
        table: DataFrame holding both measurement columns.
        first_column, second_column: Names of the paired columns.
        alternative: Container (or string) searched for ``'greater'``,
            ``'less'`` and ``'twosided'``; each keyword found adds one result
            row and one report section.
        hypothesized_difference: Offset added to the second column before the
            test statistic is computed.
        confidence_level: Confidence level of the reported intervals.

    Returns:
        dict with ``'out_table'`` (one row per requested alternative) and
        ``'model'`` (``{'report': ...}``).
    """
    df = len(table) - 1

    # Signed differences: the test statistic is based on them, so the reported
    # mean and spread must be too (absolute values would distort both).
    diff = table[first_column] - table[second_column]
    diff_mean = diff.mean()
    # Population (ddof=0) standard deviation; dividing by sqrt(df) below gives
    # the standard error of the mean difference.
    std_dev = np.sqrt(((diff - diff_mean) ** 2).mean())

    ans = stats.ttest_rel(table[first_column], table[second_column] + hypothesized_difference)
    t_value = ans[0]
    p_value_ul = ans[1]
    # One-sided p-values use the actual degrees of freedom of this sample.
    p_value_u = stats.t.sf(t_value, df)
    p_value_l = stats.t.cdf(t_value, df)

    one_sided_margin = std_dev * stats.t.isf((1 - confidence_level), df) / np.sqrt(df)
    two_sided_margin = std_dev * stats.t.isf((1 - confidence_level) / 2, df) / np.sqrt(df)

    # (keyword, relation, H0 wording, H1 wording, p-value, lower bound, upper bound)
    outcomes = [
        ('greater', '>', 'less than or equal to', 'larger than',
         p_value_u, diff_mean - one_sided_margin, np.inf),
        ('less', '<', 'larger than or equal to', 'less than',
         p_value_l, -np.inf, diff_mean + one_sided_margin),
        ('twosided', '!=', 'equal to', 'not equal to',
         p_value_ul, diff_mean - two_sided_margin, diff_mean + two_sided_margin),
    ]

    result_columns = [
        'data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value',
        'confidence_level', 'low_confidence_interval', 'upper_confidence_interval'
    ]
    statistics = ("t statistics, t distribution with " + str(df)
                  + " degrees of freedom under the null hypothesis")

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Paired T Test Result
    |
    |df|mean_difference|standard_deviation|t_value
    |--|--|--|--
    |{deg_f}|{dm}|{sd}|{tv}
    """.format(deg_f=df, dm=diff_mean, sd=std_dev, tv=t_value)))

    section_template = """
    | - H0 : true diffrence in means is {h0} {hd}.
    | - H1 : true diffrence in means is {h1} {hd}.
    |
    |p_value|confidence_level|confidence_interval
    |--|--|--
    |{p_value}|{con_lv}|({lower}, {upper})
    |
    """

    frames = []
    for keyword, relation, h0, h1, p_value, lower, upper in outcomes:
        if keyword not in alternative:
            continue
        row = {
            'data': first_column + " , " + second_column,
            'alternative_hypothesis': "true difference in means " + relation + " "
                                      + str(hypothesized_difference),
            'statistics': statistics,
            'estimates': t_value,
            'p_value': p_value,
            'confidence_level': confidence_level,
            'low_confidence_interval': lower,
            'upper_confidence_interval': upper
        }
        frames.append(pd.DataFrame([row], columns=result_columns))
        rb.addMD(strip_margin(section_template.format(
            h0=h0, h1=h1, hd=str(hypothesized_difference), p_value=p_value,
            con_lv=str(confidence_level), lower=lower, upper=upper)))

    # pd.concat replaces the removed DataFrame.append; with nothing requested
    # the result stays an empty frame.
    df_result = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    model = dict()
    model['report'] = rb.get()

    return {'out_table': df_result, 'model': model}
def _logistic_regression_train(table, feature_cols, label_col, penalty='l2', dual=False, tol=0.0001, C=1.0,
                               fit_intercept=True, intercept_scaling=1, class_weight=None,
                               random_state=None, solver='liblinear', max_iter=100, multi_class='ovr',
                               verbose=0, warm_start=False, n_jobs=1):
    """Train a ``LogisticRegression`` model and summarise its coefficients.

    Args:
        table: Training DataFrame.
        feature_cols: Feature column names.
        label_col: Label column name.
        penalty ... n_jobs: Forwarded to ``LogisticRegression`` by keyword.

    Returns:
        ``{'model': model}`` where ``model`` holds the column roles, the
        fitted intercept, coefficients and classes, the penalty and solver,
        the estimator (``'lr_model'``) and the report.
    """
    features = table[feature_cols]
    label = table[label_col]

    # Keyword arguments keep this independent of the constructor's positional
    # order, which current scikit-learn releases no longer accept.
    lr_model = LogisticRegression(
        penalty=penalty, dual=dual, tol=tol, C=C, fit_intercept=fit_intercept,
        intercept_scaling=intercept_scaling, class_weight=class_weight, random_state=random_state,
        solver=solver, max_iter=max_iter, multi_class=multi_class, verbose=verbose,
        warm_start=warm_start, n_jobs=n_jobs)
    lr_model.fit(features, label)

    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2

    # One row per feature (preceded by the intercept when fitted), one column per class.
    if fit_intercept:
        row_names = ['intercept'] + list(feature_cols)
        coef_trans = np.concatenate(([intercept], np.transpose(coefficients)), axis=0)
    else:
        row_names = list(feature_cols)
        coef_trans = np.transpose(coefficients)

    # A binary fit yields a single coefficient vector, hence a single column.
    # NOTE(review): scikit-learn documents that vector as belonging to
    # classes_[1]; the classes[0] label is kept for report compatibility -
    # confirm the intended label.
    coef_columns = [classes[0]] if is_binary else classes
    summary = pd.concat(
        (pd.DataFrame({'features': row_names}), pd.DataFrame(coef_trans, columns=coef_columns)),
        axis=1)

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Logistic Regression Result
    | ### Summary
    | {table1}
    """.format(table1=pandasDF2MD(summary))))

    model = dict()
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['report'] = rb.get()

    return {'model': model}
def one_sample_ttest(table, input_cols, alternatives, hypothesized_mean=0, conf_level=0.95):
    """Run one-sample t-tests of each input column against ``hypothesized_mean``.

    Args:
        table: DataFrame holding the samples.
        input_cols: Columns tested one after another.
        alternatives: Iterable of ``'Greater'``, ``'Less'`` and/or
            ``'Two Sided'``; other entries are ignored.
        hypothesized_mean: Mean under the null hypothesis.
        conf_level: Confidence level of the reported intervals.

    Returns:
        dict with ``'out_table'`` (one row per column and alternative) and
        ``'model'`` (the result table of the last column plus the full report).
    """
    n = len(table)
    degree = n - 1
    alpha = 1.0 - conf_level

    # statistics
    statistics = "t statistic, t distribution with %d degrees of freedom under the null hypothesis." % degree
    cols = [
        'data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value',
        'confidence_level', 'lower_confidence_interval', 'upper_confidence_interval'
    ]

    # Print model
    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## One Sample T Test Result
    | - Statistics = {s}
    | - Hypothesized mean = {h}
    | - Confidence level = {cl}
    """.format(s=statistics, h=hypothesized_mean, cl=conf_level)))

    out_frames = []
    # Defined up front so that an empty input_cols still returns a model.
    result = dict()

    for input_col in input_cols:
        column = table[input_col]
        estimates = stats.ttest_1samp(column, hypothesized_mean)[0]
        sample_mean = np.mean(column)
        # np.std is the population deviation; dividing by sqrt(n - 1) below
        # turns it into the standard error of the mean.
        sample_std = np.std(column)

        alter_list = []
        p_list = []
        CI_list = []

        for alternative in alternatives:
            if alternative == 'Greater':
                symbol = '>'
                p_value = 1.0 - t.cdf(estimates, degree)
                width = t.ppf(1.0 - alpha, degree) * sample_std / math.sqrt(n - 1)
                lower_conf_interval = sample_mean - width
                upper_conf_interval = math.inf
            elif alternative == 'Less':
                symbol = '<'
                p_value = t.cdf(estimates, degree)
                width = t.ppf(1.0 - alpha, degree) * sample_std / math.sqrt(n - 1)
                lower_conf_interval = -math.inf
                upper_conf_interval = sample_mean + width
            elif alternative == 'Two Sided':
                symbol = '!='
                # Twice the tail probability beyond |t|.
                p_value = 2.0 * t.cdf(-abs(estimates), degree)
                width = t.ppf(1.0 - alpha / 2, degree) * sample_std / math.sqrt(n - 1)
                lower_conf_interval = sample_mean - width
                upper_conf_interval = sample_mean + width
            else:
                continue

            # model
            alter_list.append('true mean {symbol} {hypothesized_mean}'.format(
                symbol=symbol, hypothesized_mean=hypothesized_mean))
            p_list.append(p_value)
            CI_list.append('({lower_conf_interval}, {upper_conf_interval})'.format(
                lower_conf_interval=lower_conf_interval,
                upper_conf_interval=upper_conf_interval))

            # out_table
            alternative_hypothesis = "true mean " + symbol + str(hypothesized_mean)
            out_frames.append(pd.DataFrame([[
                input_col, alternative_hypothesis, statistics, estimates, p_value,
                conf_level, lower_conf_interval, upper_conf_interval
            ]], columns=cols))

        # Print model
        conf_level_percent = conf_level * 100
        ci_column = '%g%% confidence Interval' % conf_level_percent
        result_table = pd.DataFrame(
            {'alternative hypothesis': alter_list, 'p-value': p_list, ci_column: CI_list},
            columns=['alternative hypothesis', 'p-value', ci_column])
        result['result_table'] = result_table

        rb.addMD(strip_margin("""
        | ### Data = {input_col}
        | - Estimates = {estimates}
        |
        | {result_table}
        """.format(input_col=input_col, estimates=estimates,
                   result_table=pandasDF2MD(result_table))))

    # pd.concat replaces the removed DataFrame.append; row indices are not
    # reset, matching the previous append-based behaviour.
    out_table = pd.concat(out_frames) if out_frames else pd.DataFrame()

    result['report'] = rb.get()

    return {'out_table': out_table, 'model': result}
def two_sample_ttest_for_stacked_data(table, response_cols, factor_col, alternatives, first, second,
                                      hypo_diff=0, equal_vari='pooled', confi_level=0.95):
    """Compare two factor levels with two-sample t-tests on stacked data.

    Args:
        table: DataFrame with the response columns and the factor column.
        response_cols: Columns tested one after another.
        factor_col: Column holding the group labels.
        alternatives: Container (or string) searched for ``'larger'``,
            ``'smaller'`` and ``'two-sided'``.
        first, second: The two factor levels to compare; converted to the
            type of the factor values (string, boolean, otherwise float).
        hypo_diff: Hypothesized difference passed to ``ttest_ind`` as ``value``.
        equal_vari: ``'pooled'``, ``'unequal'`` or ``'auto'``; ``'auto'``
            chooses per column with a two-sided F-test at the 5% level.
        confi_level: Confidence level of the reported intervals.

    Returns:
        dict with ``'out_table'`` (one row per column and alternative) and
        ``'model'`` (``{'report': ...}``).
    """
    factor = table[factor_col]
    # Positional access: the first row need not carry the index label 0.
    sample_value = factor.iloc[0]
    if isinstance(sample_value, str):
        first_key, second_key = first, second
    elif pd.api.types.is_bool(sample_value):  # also recognises numpy.bool_
        first_key, second_key = _factor_level_as_bool(first), _factor_level_as_bool(second)
    else:
        first_key, second_key = float(first), float(second)
    table_first = table[factor == first_key]
    table_second = table[factor == second_key]

    tmp_table = []

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Two Sample T Test for Stacked Data Result
    | - Hypothesized mean = {hypo_diff}
    | - Confidence level = {confi_level}
    """.format(hypo_diff=hypo_diff, confi_level=confi_level)))

    # (alternative keyword, relation symbol, probability of the critical value)
    alternative_specs = [
        ('larger', '>', confi_level),
        ('smaller', '<', confi_level),
        # A two-sided interval splits the remaining probability over both tails.
        ('two-sided', '!=', (1 + confi_level) / 2),
    ]

    for response_col in response_cols:
        sample1 = table_first[response_col]
        sample2 = table_second[response_col]
        number1 = len(sample1)
        number2 = len(sample2)
        std1 = sample1.std()
        std2 = sample2.std()
        mean_diff = sample1.mean() - sample2.mean()

        # Resolve 'auto' per column without overwriting the caller's setting.
        usevar = equal_vari
        if equal_vari == 'auto':
            # s1^2 / s2^2 follows F(n1 - 1, n2 - 1) under equal variances.
            f_value = (std1 ** 2) / (std2 ** 2)
            f_cdf = stats.f.cdf(f_value, number1 - 1, number2 - 1)
            f_test_p_value = 2 * min(f_cdf, 1 - f_cdf)
            usevar = 'unequal' if f_test_p_value < 0.05 else 'pooled'

        # The statistic and degrees of freedom do not depend on the
        # alternative; this result feeds the per-column report header.
        base_result = ttest_ind(sample1, sample2, 'larger', usevar=usevar, value=hypo_diff)

        data_label = '%s by %s(%s,%s)' % (response_col, factor_col, first, second)
        tmp_model = []
        for keyword, symbol, prob in alternative_specs:
            if keyword not in alternatives:
                continue
            ttestresult = ttest_ind(sample1, sample2, keyword, usevar=usevar, value=hypo_diff)
            margin = _two_sample_ci_margin(prob, ttestresult[2], usevar, number1, number2, std1, std2)
            lower = -math.inf if keyword == 'smaller' else mean_diff - margin
            upper = math.inf if keyword == 'larger' else mean_diff + margin

            hypothesis = 'true difference in means %s 0.0' % symbol
            tmp_model.append([hypothesis, ttestresult[1], (lower, upper)])
            tmp_table.append([
                data_label, hypothesis,
                't statistic, t distribution with %f degrees of freedom under the null hypothesis'
                % ttestresult[2],
                ttestresult[0], ttestresult[1], confi_level, lower, upper
            ])

        # Passing the column names here also covers an empty alternative list.
        result_model = pd.DataFrame.from_records(tmp_model, columns=[
            'alternatives', 'p values', '%g%% confidence interval' % (confi_level * 100)
        ])
        rb.addMD(strip_margin("""
        | #### Data = {response_col} by {factor_col}({first},{second})
        | - Statistics = t statistic, t distribution with {ttestresult2} degrees of freedom under the null hypothesis
        | - Estimates= {ttestresult0}
        |
        | {result_model}
        |
        """.format(ttestresult2=base_result[2], response_col=response_col, factor_col=factor_col,
                   first=first, second=second, ttestresult0=base_result[0],
                   result_model=pandasDF2MD(result_model))))

    result = pd.DataFrame.from_records(tmp_table, columns=[
        'data', 'alternative_hypothesis', 'statistics', 'estimates', 'p_value',
        'confidence_level', 'lower_confidence_interval', 'upper_confidence_interval'
    ])

    model = dict()
    model['report'] = rb.get()

    return {'out_table': result, 'model': model}


def _two_sample_ci_margin(prob, df, usevar, number1, number2, std1, std2):
    """Return the half-width of the confidence interval for the mean difference."""
    if usevar == 'pooled':
        pooled_std = sqrt(((number1 - 1) * std1 ** 2 + (number2 - 1) * std2 ** 2)
                          / (number1 + number2 - 2))
        std_err = pooled_std * sqrt(1 / number1 + 1 / number2)
    else:
        std_err = sqrt(std1 ** 2 / number1 + std2 ** 2 / number2)
    return t.ppf(prob, df) * std_err


def _factor_level_as_bool(value):
    """Convert a factor level to bool, reading 'false', '0' and '' as False."""
    if isinstance(value, str):
        return value.strip().lower() not in ('', 'false', '0')
    return bool(value)
def oneway_anova(table, response_cols, factor_col):
    """Run a one-way ANOVA of every response column against one factor column.

    For each column in ``response_cols`` the function draws a box plot per
    factor level, fits ``response ~ C(factor)`` with ``ols``, builds the ANOVA
    table with ``anova_lm`` and appends residual diagnostics (distribution
    plot and Q-Q plot) to a markdown report.

    Args:
        table: Data frame holding the response and factor columns.
        response_cols: Names of the columns to analyse; one model per column.
        factor_col: Name of the grouping column.

    Returns:
        ``{'result': result}`` where
        ``result['_grouped_data'][col]['p_value']`` is the first ``PR(>F)``
        entry of the ANOVA table for ``col`` and ``result['report']`` is the
        value returned by ``ReportBuilder.get()``.
    """
    rb = ReportBuilder()
    # The leading '|' keeps this header consistent with the other reports
    # passed through strip_margin.
    rb.addMD(
        strip_margin("""
        | ## One-way Analysis of Variance Result
        """))

    groups = table[factor_col].unique()
    groups.sort()
    # Total label length decides how far the x tick labels are rotated.
    sum_len = np.sum([len(str(group)) for group in groups])

    result = dict()
    result['_grouped_data'] = dict()

    for response_col in response_cols:
        result['_grouped_data'][response_col] = dict()

        ax = sns.boxplot(x=factor_col, y=response_col, data=table, order=groups)
        if sum_len > 512:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
        elif sum_len > 64:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
        fig_box = plt2MD(plt)
        plt.clf()

        # TODO factor_col = class => error
        model = ols(
            """Q('{response_col}') ~ C(Q('{factor_col}'))""".format(
                response_col=response_col, factor_col=factor_col),
            table).fit()
        anova = anova_lm(model)
        anova_df = pandasDF2MD(anova)
        # The ANOVA table is indexed by term name, so take the first row
        # explicitly by position instead of relying on the integer fallback.
        p_value = anova["PR(>F)"].iloc[0]

        residual = model.resid

        sns.distplot(residual)
        distplot = plt2MD(plt)
        plt.clf()

        sm.qqplot(residual, line='s')
        qqplot = plt2MD(plt)
        plt.clf()

        rb.addMD(
            strip_margin("""
            | ## {response_col} by {factor_col}
            | {fig_box}
            |
            | ### ANOVA
            | {anova_df}
            |
            | ### Diagnostics
            | {distplot}
            |
            | {qqplot}
            """.format(response_col=response_col,
                       factor_col=factor_col,
                       fig_box=fig_box,
                       anova_df=anova_df,
                       distplot=distplot,
                       qqplot=qqplot)))

        result['_grouped_data'][response_col]['p_value'] = p_value

    result['report'] = rb.get()
    return {'result': result}
def _xgb_regression_train(table, feature_cols, label_col, max_depth=3,
                          learning_rate=0.1, n_estimators=100, silent=True,
                          objectibe='reg:linear', booster='gbtree', n_jobs=1,
                          nthread=None, gamma=0, min_child_weight=1,
                          max_delta_step=0, subsample=1, colsample_bytree=1,
                          colsample_bylevel=1, reg_alpha=0, reg_lambda=1,
                          scale_pos_weight=1, base_score=0.5, random_state=0,
                          seed=None, missing=None, sample_weight=None,
                          eval_set=None, eval_metric=None,
                          early_stopping_rounds=None, verbose=True,
                          xgb_model=None, sample_weight_eval_set=None):
    """Train an ``XGBRegressor`` and build a markdown report for it.

    Args:
        table: Data frame holding the feature and label columns.
        feature_cols: Names of the feature columns.
        label_col: Name of the target column.
        objectibe: Learning objective. The misspelled parameter name is kept
            for backward compatibility and forwarded as ``objective``.
        sample_weight, eval_set, eval_metric, early_stopping_rounds, verbose,
        xgb_model, sample_weight_eval_set: Forwarded to ``regressor.fit``.
        Remaining keyword parameters are forwarded to ``XGBRegressor`` under
        the same name.

    Returns:
        ``{'model': out_model}`` where ``out_model`` holds the fitted
        regressor, its parameters, the feature importances, the importance
        plot and the rendered report.
    """
    # Arguments are bound by keyword so that a change in the positional order
    # of the library signature cannot silently shift values between options.
    regressor = XGBRegressor(max_depth=max_depth,
                             learning_rate=learning_rate,
                             n_estimators=n_estimators,
                             silent=silent,
                             objective=objectibe,
                             booster=booster,
                             n_jobs=n_jobs,
                             nthread=nthread,
                             gamma=gamma,
                             min_child_weight=min_child_weight,
                             max_delta_step=max_delta_step,
                             subsample=subsample,
                             colsample_bytree=colsample_bytree,
                             colsample_bylevel=colsample_bylevel,
                             reg_alpha=reg_alpha,
                             reg_lambda=reg_lambda,
                             scale_pos_weight=scale_pos_weight,
                             base_score=base_score,
                             random_state=random_state,
                             seed=seed,
                             missing=missing)
    regressor.fit(table[feature_cols], table[label_col],
                  sample_weight=sample_weight,
                  eval_set=eval_set,
                  eval_metric=eval_metric,
                  early_stopping_rounds=early_stopping_rounds,
                  verbose=verbose,
                  xgb_model=xgb_model,
                  sample_weight_eval_set=sample_weight_eval_set)

    # json
    get_param = regressor.get_params()
    feature_importance = regressor.feature_importances_

    plot_importance(regressor)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()

    out_model = _model_dict('xgb_regression_model')
    out_model['feature_cols'] = feature_cols
    out_model['label_col'] = label_col
    out_model['parameters'] = get_param
    out_model['feature_importance'] = feature_importance
    out_model['regressor'] = regressor
    out_model['plot_importance'] = fig_plot_importance

    # report
    get_param_list = [['feature_cols', feature_cols], ['label_col', label_col]]
    get_param_list.extend([key, value] for key, value in get_param.items())
    get_param_df = pd.DataFrame(data=get_param_list,
                                columns=['parameter', 'value'])
    # One row, one column per feature.
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
        | ## XGB Regression Result
        |
        | ### Plot Importance
        | {image_importance}
        |
        | ### Feature Importance
        | {table_feature_importance}
        |
        | ### Parameters
        | {table_parameter}
        |
        """.format(image_importance=fig_plot_importance,
                   table_feature_importance=pandasDF2MD(feature_importance_df, 20),
                   table_parameter=pandasDF2MD(get_param_df))))
    out_model['report'] = rb.get()

    return {'model': out_model}
def _decision_tree_classification_train(
        table, feature_cols, label_col,
        criterion='gini', splitter='best', max_depth=None,
        min_samples_split=2, min_samples_leaf=1,
        min_weight_fraction_leaf=0.0, max_features=None, random_state=None,
        max_leaf_nodes=None, min_impurity_decrease=0.0,
        min_impurity_split=None, class_weight=None, presort=False,
        sample_weight=None, check_input=True, X_idx_sorted=None):
    """Train a ``DecisionTreeClassifier`` and build a markdown report for it.

    Args:
        table: Data frame holding the feature and label columns.
        feature_cols: Names of the feature columns.
        label_col: Name of the class label column.
        sample_weight, check_input, X_idx_sorted: Forwarded to
            ``classifier.fit``.
        Remaining keyword parameters are forwarded to
        ``DecisionTreeClassifier`` under the same name.

    Returns:
        ``{'model': model}`` where ``model`` holds the fitted classifier, its
        learned attributes and parameters, and the rendered report.
    """
    classifier = DecisionTreeClassifier(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        class_weight=class_weight,
        presort=presort)
    classifier.fit(table[feature_cols], table[label_col],
                   sample_weight=sample_weight,
                   check_input=check_input,
                   X_idx_sorted=X_idx_sorted)

    try:
        from io import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        # class_names must follow the order of classifier.classes_; the order
        # in which labels first appear in the table can differ from it and
        # would attach the wrong name to tree nodes.
        export_graphviz(classifier, out_file=dot_data,
                        feature_names=feature_cols,
                        class_names=[str(name) for name in classifier.classes_],
                        filled=True, rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        from brightics.common.report import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        # Best effort: the report is still produced when the optional
        # drawing tool chain is unavailable.
        fig_tree = "Graphviz is needed to draw a Decision Tree graph. Please download it from http://graphviz.org/download/ and install it to your computer."

    # json
    model = _model_dict('decision_tree_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['classes'] = classifier.classes_
    feature_importance = classifier.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = classifier.max_features_
    model['n_classes'] = classifier.n_classes_
    model['n_features'] = classifier.n_features_
    model['n_outputs'] = classifier.n_outputs_
    model['tree'] = classifier.tree_
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier

    # report: horizontal bar chart, least important feature at the bottom
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)), feature_importance[indices],
             color='b', align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v, i, " {:.2f}".format(v), color='b', va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.xlim(0, 1.1)
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
        | ## Decision Tree Classification Train Result
        | ### Decision Tree
        | {fig_tree}
        |
        | ### Feature Importance
        | {fig_feature_importances}
        |
        | ### Parameters
        | {list_parameters}
        |
        """.format(fig_tree=fig_tree,
                   fig_feature_importances=fig_feature_importances,
                   list_parameters=params)))
    model['report'] = rb.get()

    return {'model': model}
def paired_ttest(table, first_column, second_column, alternative,
                 hypothesized_difference=0, confidence_level=0.95):
    """Run a paired t-test between two columns of ``table``.

    Args:
        table: Data frame holding both columns.
        first_column: Name of the first sample column.
        second_column: Name of the second sample column.
        alternative: Container (or string) checked with ``in`` for the keys
            ``'greater'``, ``'less'`` and ``'twosided'``; one result row is
            produced per key found, in that order.
        hypothesized_difference: Value added to the second column before
            ``stats.ttest_rel`` is evaluated.
        confidence_level: Confidence level used for the interval bounds.

    Returns:
        ``{'out_table': df_result, 'model': model}`` where ``df_result`` has
        one row per selected alternative (an empty frame when none is
        selected) and ``model['report']`` is the rendered report.
    """
    deg_f = len(table) - 1
    differences = table[first_column] - table[second_column]
    diff_mean = differences.mean()
    std_dev = np.std(differences)

    # The test is evaluated once; statistic and two-sided p-value come from
    # the same result.
    ttest = stats.ttest_rel(table[first_column],
                            table[second_column] + hypothesized_difference)
    t_value = ttest[0]
    p_value_ul = ttest[1]
    p_value_u = stats.t.sf(t_value, deg_f)
    p_value_l = stats.t.cdf(t_value, deg_f)

    one_sided_margin = std_dev * stats.t.isf((1 - confidence_level), deg_f) / np.sqrt(deg_f)
    two_sided_margin = std_dev * stats.t.isf((1 - confidence_level) / 2, deg_f) / np.sqrt(deg_f)
    left_u = diff_mean - one_sided_margin
    right_l = diff_mean + one_sided_margin
    left_ul = diff_mean - two_sided_margin
    right_ul = diff_mean + two_sided_margin

    data_label = first_column + " , " + second_column
    statistics_label = ("t statistics, t distribution with " + str(deg_f)
                        + " degrees of freedom under the null hypothesis")
    ci_label = str(confidence_level * 100) + '% confidence interval'

    # (selector key, report name, comparison sign, p-value, lower, upper)
    specs = [
        ('greater', 'Greater', '>', p_value_u, left_u, np.inf),
        ('less', 'Less', '<', p_value_l, -np.inf, right_l),
        ('twosided', 'Two Sided', '!=', p_value_ul, left_ul, right_ul),
    ]

    out_columns = ['data', 'alternative_hypothesis', 'statistics', 'estimates',
                   'p_value', 'confidence_level', 'low_confidence_interval',
                   'upper_confidence_interval']
    report_columns = ['Alternative', 'H1', 't_value', 'p_value', ci_label]

    out_rows = []
    report_rows = []
    for key, name, sign, p_value, lower, upper in specs:
        if key not in alternative:
            continue
        hypothesis = ("true difference in means " + sign + " "
                      + str(hypothesized_difference))
        out_rows.append({'data': data_label,
                         'alternative_hypothesis': hypothesis,
                         'statistics': statistics_label,
                         'estimates': t_value,
                         'p_value': p_value,
                         'confidence_level': confidence_level,
                         'low_confidence_interval': lower,
                         'upper_confidence_interval': upper})
        report_rows.append({'Alternative': name,
                            'H1': hypothesis,
                            't_value': t_value,
                            'p_value': p_value,
                            ci_label: '(' + str(lower) + ', ' + str(upper) + ')'})

    # With no alternative selected the output stays a bare empty frame.
    if out_rows:
        df_result = pd.DataFrame(out_rows, columns=out_columns)
    else:
        df_result = pd.DataFrame()
    ordered_result_table = pd.DataFrame(report_rows, columns=report_columns)

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    |## Paired T Test Result
    |##### df : {deg_f}
    |##### Mean of differences : {dm}
    |##### Standard deviation : {sd}
    |
    |{result_table}
    |
    """.format(deg_f=deg_f, dm=diff_mean, sd=std_dev,
               result_table=pandasDF2MD(ordered_result_table))))

    model = dict()
    model['report'] = rb.get()

    return {'out_table': df_result, 'model': model}
def _evaluate_classification(table, label_col, prediction_col):
    """Score predictions against labels and render a classification report.

    Accuracy plus weighted F1, precision and recall are computed from the two
    columns, and a raw and a normalized confusion matrix are plotted.

    Args:
        table: Data frame holding both columns.
        label_col: Name of the column with the true labels.
        prediction_col: Name of the column with the predicted labels.

    Returns:
        ``{'result': summary}`` with the individual scores, a one-row
        ``metrics`` frame and the rendered report.
    """
    y_true = table[label_col]
    y_pred = table[prediction_col]

    # metrics
    weighted = dict(average="weighted")
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, **weighted)
    precision = precision_score(y_true, y_pred, **weighted)
    recall = recall_score(y_true, y_pred, **weighted)

    # union1d already returns the sorted unique values of both columns
    class_names = np.union1d(y_true.values, y_pred.values)

    # confusion matrix with raw counts
    plt.figure()
    _plot_confusion_matrix(y_true, y_pred, classes=class_names,
                           title='Confusion matrix, without normalization')
    fig_cnf_matrix = plt2MD(plt)

    # confusion matrix normalized
    plt.figure()
    _plot_confusion_matrix(y_true, y_pred, classes=class_names,
                           normalize=True,
                           title='Normalized confusion matrix')
    fig_cnf_matrix_normalized = plt2MD(plt)
    plt.clf()

    metric_order = ['f1', 'accuracy', 'precision', 'recall']
    metric_values = {'f1': f1, 'accuracy': accuracy,
                     'precision': precision, 'recall': recall}
    all_df = pd.DataFrame([metric_values], columns=metric_order)

    # json
    summary = {
        'label_col': label_col,
        'prediction_col': prediction_col,
        'f1_score': f1,
        'accuracy_score': accuracy,
        'precision_score': precision,
        'recall_score': recall,
        'metrics': all_df,
    }

    # report
    report = ReportBuilder()
    report.addMD(
        strip_margin("""
        | ## Evaluate Classification Result
        | ### Metrics
        | {table1}
        |
        | ### Confusion matrix
        | {fig_confusion_matrix}
        |
        | {fig_confusion_matrix_normalized}
        |
        """.format(table1=pandasDF2MD(all_df),
                   fig_confusion_matrix=fig_cnf_matrix,
                   fig_confusion_matrix_normalized=fig_cnf_matrix_normalized)))
    summary['report'] = report.get()

    return {'result': summary}
def _glm_train(table, feature_cols, label_col, family="Gaussian", link="ident",
               fit_intercept=True):
    """Fit a statsmodels GLM and build an HTML/markdown report for it.

    Args:
        table: Data frame holding the feature and label columns.
        feature_cols: Names of the feature columns.
        label_col: Name of the response column; must not be a feature.
        family: One of ``"Gaussian"``, ``"inv_Gaussian"``, ``"binomial"``,
            ``"Poisson"``, ``"neg_binomial"``, ``"gamma"``, ``"Tweedie"``.
        link: One of ``"ident"``, ``"log"``, ``"logit"``, ``"probit"``,
            ``"cloglog"``, ``"pow"``, ``"nbinom"``.
        fit_intercept: When equal to ``True`` a constant column is added to
            the features before fitting.

    Returns:
        ``{'model': model}`` holding the fitted result, selected statistics
        and the rendered report.

    Raises:
        Exception: If ``label_col`` is also listed in ``feature_cols``.
        ValueError: If ``family`` or ``link`` is not a supported name.
    """
    if label_col in feature_cols:
        raise Exception("%s is duplicated." % label_col)

    # Option name -> attribute name, resolved lazily so only the selected
    # entry has to exist in the installed statsmodels.
    family_names = {
        "Gaussian": "Gaussian",
        "inv_Gaussian": "InverseGaussian",
        "binomial": "Binomial",
        "Poisson": "Poisson",
        "neg_binomial": "NegativeBinomial",
        "gamma": "Gamma",
        "Tweedie": "Tweedie",
    }
    link_names = {
        "ident": "identity",
        "log": "log",
        "logit": "logit",
        "probit": "probit",
        "cloglog": "CLogLog",
        "pow": "Power",
        "nbinom": "NegativeBinomial",
    }
    if family not in family_names:
        raise ValueError("Unknown family: %s" % family)
    if link not in link_names:
        raise ValueError("Unknown link: %s" % link)

    features = table[feature_cols]
    label = table[label_col]

    sm_family = getattr(sm.families, family_names[family])()
    sm_link = getattr(sm.families.links, link_names[link])

    # NOTE(review): statsmodels takes the link function from the family
    # object, so the ``link`` keyword below most likely has no effect on the
    # fit. It is passed exactly as before because applying the link would
    # change results for existing callers -- confirm the intended behavior.
    if fit_intercept == True:  # equality kept to preserve non-bool handling
        exog = sm.add_constant(features)
    else:
        exog = features
    glm_model = sm.GLM(label, exog, family=sm_family, link=sm_link).fit()

    summary = glm_model.summary().as_html()

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## GLM Result
    | ### Summary
    |
    """))
    rb.addHTML(summary)

    model = _model_dict('glm_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['family'] = family
    model['link'] = link
    model['coefficients'] = glm_model.params
    model['aic'] = glm_model.aic
    model['bic'] = glm_model.bic
    model['tvalues'] = glm_model.tvalues
    model['pvalues'] = glm_model.pvalues
    model['fit_intercept'] = fit_intercept
    model['glm_model'] = glm_model
    model['report'] = rb.get()

    return {'model': model}
def _function_by_group(function, table=None, model=None, group_by=None,
                       **params):
    """Apply ``function`` to every group and collect the results per key.

    The groups come from ``table`` (a data frame split with ``_group`` using
    ``group_by``, or an already grouped dict) or, when no table is given,
    from an already grouped ``model`` dict. A grouped dict is one that has a
    ``'_grouped_data'`` mapping and a ``'_group_by'`` entry.

    Args:
        function: Callable invoked once per group with ``table=`` and/or
            ``model=`` set to that group's entry, plus ``**params``. It must
            return a dict.
        table: Data frame or grouped dict, optional.
        model: Grouped dict, optional.
        group_by: Grouping columns; required when ``table`` is a data frame.
        **params: Extra keyword arguments forwarded to ``function``.

    Returns:
        A dict with one entry per key of the function's result. Each entry is
        ``{'_grouped_data': {group: value}, '_group_by': group_by}``; entries
        whose values are dicts containing ``'report'`` additionally get a
        merged ``'report'``, and entries whose values are data frames are
        passed through ``_flatten``.

    Raises:
        Exception: If no usable grouped input is provided.
    """
    if table is not None:
        if isinstance(table, pd.DataFrame) and group_by is not None:
            # table and group_by
            table, group_keys = _group(table, group_by)
        elif isinstance(table, dict) and '_grouped_data' in table:
            # grouped_data
            group_keys = [*table['_grouped_data']]
            group_by = table['_group_by']
        elif isinstance(table, pd.DataFrame) and group_by is None:
            raise Exception('This function requires group_by')
        else:
            raise Exception('Unknown type.')
    elif isinstance(model, dict) and '_grouped_data' in model:
        group_keys = [*model['_grouped_data']]
        group_by = model['_group_by']
    else:
        # Covers both "nothing was passed" and "model is not grouped data",
        # so group_keys can never be left undefined.
        raise Exception(
            'This function requires a table or a model as an input.')

    def _call(group):
        """Invoke ``function`` for one group with the inputs that were given."""
        kwargs = dict()
        if table is not None:
            kwargs['table'] = table['_grouped_data'][group]
        if model is not None:
            kwargs['model'] = model['_grouped_data'][group]
        kwargs.update(params)
        return function(**kwargs)

    # The first group's result defines the shape of the output.
    sample_result = _call(group_keys[0])
    res_keys = [*sample_result]
    df_keys = [
        k for k, v in sample_result.items() if isinstance(v, pd.DataFrame)
    ]
    model_keys_containing_repr = [
        k for k, v in sample_result.items()
        if isinstance(v, dict) and 'report' in v
    ]

    res_dict = {
        res_key: {'_grouped_data': dict(), '_group_by': group_by}
        for res_key in res_keys
    }

    for position, group in enumerate(group_keys):
        # Reuse the sample result so the first group is not evaluated twice.
        res_group = sample_result if position == 0 else _call(group)
        for res_key in res_keys:
            res_dict[res_key]['_grouped_data'][group] = res_group[res_key]

    for repr_key in model_keys_containing_repr:
        rb = ReportBuilder()
        for group in group_keys:
            rb.addMD('{group}'.format(group=group))
            rb.merge(res_dict[repr_key]['_grouped_data'][group]['report'])
        res_dict[repr_key]['report'] = rb.get()

    for df_key in df_keys:
        res_dict[df_key] = _flatten(res_dict[df_key])

    return res_dict
def naive_bayes_train(table, feature_cols, label_col, alpha=1.0,
                      fit_prior=True, class_prior=None):
    """Train a ``MultinomialNB`` classifier and build a markdown report.

    Labels are encoded with a ``LabelEncoder`` before fitting; the encoder is
    returned with the model.

    Args:
        table: Data frame holding the feature and label columns.
        feature_cols: Names of the feature columns.
        label_col: Name of the class label column.
        alpha: Smoothing parameter forwarded to ``MultinomialNB``.
        fit_prior: Forwarded to ``MultinomialNB``.
        class_prior: Optional list of ``"<label>:<probability>"`` strings.
            Each probability is stored at the position the label encoder
            assigns to that label.

    Returns:
        ``{'model': model}`` holding the fitted classifier, the label encoder
        and the rendered report.
    """
    features = table[feature_cols]
    label = table[label_col]
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(label)
    label_correspond = label_encoder.transform(label)

    if class_prior is not None:
        tmp_class_prior = [0] * len(class_prior)
        for elems in class_prior:
            # Split on the last colon so labels containing ':' still parse.
            class_label, probability = elems.rsplit(":", 1)
            encoded = label_encoder.transform([class_label])[0]
            tmp_class_prior[encoded] = float(probability)
        class_prior = tmp_class_prior

    # Keyword arguments: the estimator's parameters are keyword-only in
    # current scikit-learn releases.
    nb_model = MultinomialNB(alpha=alpha, fit_prior=fit_prior,
                             class_prior=class_prior)
    nb_model.fit(features, label_correspond)
    prediction_correspond = nb_model.predict(features)

    get_param = dict()
    get_param['Lambda'] = alpha
    get_param['Feature Columns'] = feature_cols
    get_param['Label Column'] = label_col

    cnf_matrix = confusion_matrix(label_correspond, prediction_correspond)

    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=label_encoder.classes_,
                          title='Confusion Matrix')
    fig_confusion_matrix = plt2MD(plt)
    # Clear the figure once it has been rendered, as the other reports do.
    plt.clf()
    accuracy = nb_model.score(features, label_correspond) * 100

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
        | ## Naive Bayes Classification Result
        | ### Parameters
        | {table_parameter}
        | ### Predicted vs Actual
        | {image1}
        | #### Accuacy = {accuracy}%
        |
        """.format(image1=fig_confusion_matrix,
                   accuracy=accuracy,
                   table_parameter=dict2MD(get_param))))

    model = _model_dict('naive_bayes_model')
    model['features'] = feature_cols
    model['label_col'] = label_col
    model['label_encoder'] = label_encoder
    model['nb_model'] = nb_model
    model['report'] = rb.get()

    return {'model': model}
def _chi_square_test_of_independence(table, response_cols, factor_col,
                                     correction=False):
    """Test each response column for independence from ``factor_col``.

    For every column in ``response_cols`` a contingency table against
    ``factor_col`` is built and passed to ``stats.chi2_contingency``; the
    statistic, degrees of freedom, p-value and a conclusion at the 5% level
    are added to a markdown report.

    Args:
        table: Data frame holding the response and factor columns.
        response_cols: Names of the categorical columns to test.
        factor_col: Name of the categorical column tested against.
        correction: Forwarded to ``stats.chi2_contingency``.

    Returns:
        ``{'model': model}`` where ``model['report']`` is the rendered report.
    """
    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
        | ## Chi-square Test of Independence Result
        |  - H0: the two categorical variables are independent.
        |  - H1: the two categorical variables are dependent.
        """))

    for response_col in response_cols:
        contingency_table = pd.crosstab(table[response_col],
                                        table[factor_col], margins=True)
        # Drop the margin row and column again before testing.
        response_index = len(contingency_table) - 1
        factor_index = len(contingency_table.columns) - 1
        f_object = np.array(
            contingency_table.iloc[0:response_index, 0:factor_index])
        stat_value, p_value, dof_value = stats.chi2_contingency(
            f_object, correction=correction, lambda_=1)[0:3]

        if p_value < 0.05:
            dependence = 'Reject the null hypothesis that two categorical variables are independent at 5% significance level.'
        elif p_value >= 0.05:
            dependence = 'No association was found between two categorical variables at 5% significance level.'
        else:
            # Only a NaN p-value fails both comparisons above.
            dependence = 'Independence of two categorical variables cannot be decided.'

        # Values are rendered as strings, one row per tested column.
        result_table_simple = pd.DataFrame(
            {'estimate': ['{stat_chi}'.format(stat_chi=stat_value)],
             'df': ['{dof}'.format(dof=dof_value)],
             'p_value': ['{p_chi}'.format(p_chi=p_value)]},
            columns=['estimate', 'df', 'p_value'])

        rb.addMD(
            strip_margin("""
            |### Label: {label}, Feature: {feature}
            |
            |{result_table_simple}
            |
            |{dependence}
            |
            |
            """.format(label=factor_col,
                       feature=response_col,
                       result_table_simple=pandasDF2MD(result_table_simple),
                       dependence=dependence)))

    model = _model_dict('Chi-square test of independence')
    model['report'] = rb.get()

    return {'model': model}
def _pca(table, input_cols, new_column_name='projected_', n_components=None,
         copy=True, whiten=False, svd_solver='auto', tol=0.0,
         iterated_power='auto', random_state=None, hue=None, alpha=0,
         key_col=None):
    """Fit a PCA on ``input_cols`` and append the projected columns.

    The PCA itself is fitted with all components; ``n_components`` only
    limits how many projected columns and component rows are reported.

    Args:
        table: Data frame holding the input columns.
        input_cols: Names of the columns to decompose.
        new_column_name: Prefix of the projected column names; the component
            index is appended.
        n_components: Number of projected columns to keep; defaults to
            ``len(input_cols)``.
        copy, whiten, svd_solver, tol, iterated_power, random_state:
            Forwarded to ``PCA``.
        hue, alpha, key_col: Forwarded to the plotting helpers.

    Returns:
        ``{'out_table': out_df, 'model': model}`` where ``out_df`` is
        ``table`` (index reset) plus the projected columns and ``model``
        holds the fitted PCA, its attributes and the rendered report.
    """
    num_feature_cols = len(input_cols)
    if n_components is None:
        n_components = num_feature_cols

    # Keyword arguments: everything after n_components is keyword-only in
    # current scikit-learn releases.
    pca = PCA(n_components=None, copy=copy, whiten=whiten,
              svd_solver=svd_solver, tol=tol, iterated_power=iterated_power,
              random_state=random_state)
    pca_model = pca.fit(table[input_cols])

    column_names = [new_column_name + str(i) for i in range(0, n_components)]

    pca_result = pca_model.transform(table[input_cols])
    out_df = pd.DataFrame(data=pca_result[:, :n_components],
                          columns=column_names)
    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names

    res_components = pca_model.components_
    # Flat column labels; a nested list here would build a MultiIndex header.
    res_components_df = pd.DataFrame(data=res_components[:n_components],
                                     columns=input_cols)
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_

    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # visualization
    plt.figure()
    if n_components == 1:
        sns.scatterplot(x=column_names[0], y=column_names[0], hue=hue,
                        data=out_df)
        plt_two = plt2MD(plt)
        plt.clf()
    else:
        plt_two = _biplot(
            0,
            1,
            pc_columns=column_names,
            columns=input_cols,
            singular_values=res_singular_values,
            components=res_components,
            explained_variance_ratio=res_explained_variance_ratio,
            alpha=alpha,
            hue=hue,
            data=out_df,
            ax=plt.gca(),
            key_col=key_col)

    plt.figure()
    fig_scree = _screeplot(res_explained_variance,
                           res_explained_variance_ratio, n_components)

    table_explained_variance = pd.DataFrame(res_explained_variance,
                                            columns=['explained_variance'])
    table_explained_variance[
        'explained_variance_ratio'] = res_explained_variance_ratio
    table_explained_variance[
        'cum_explained_variance_ratio'] = res_explained_variance_ratio.cumsum()

    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
        | ## PCA Result
        | ### Plot
        | {image1}
        |
        | ### Explained Variance
        | {fig_scree}
        | {table_explained_variance}
        |
        | ### Components
        | {table2}
        |
        | ### Parameters
        | {parameter1}
        """.format(image1=plt_two,
                   fig_scree=fig_scree,
                   table_explained_variance=pandasDF2MD(table_explained_variance),
                   parameter1=dict2MD(res_get_param),
                   table2=pandasDF2MD(res_components_df))))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['report'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols

    return {'out_table': out_df, 'model': model}
def _correlation(table, vars, method='pearson', height=2.5, corr_prec=2):
    """Compute pairwise correlations and draw a correlation matrix plot.

    Args:
        table: Data frame holding the columns to correlate.
        vars: Names of the columns; every pair ``(vars[i], vars[j])`` with
            ``j < i`` is evaluated.
        method: ``'pearson'``, ``'spearman'`` or ``'kendal'`` (the historical
            spelling; ``'kendall'`` is accepted as well).
        height: Height of each facet passed to ``sns.PairGrid``; also scales
            marker and font sizes.
        corr_prec: Number of decimals shown for the coefficient in the upper
            triangle.

    Returns:
        ``{'result': res}`` where ``res`` holds the parameters, the
        ``corr_table`` frame (columns ``x``, ``y``, ``corr``, ``p_value``)
        and the rendered report.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    corr_functions = {
        'pearson': stats.pearsonr,
        'spearman': stats.spearmanr,
        'kendal': stats.kendalltau,
        'kendall': stats.kendalltau,
    }
    if method not in corr_functions:
        raise ValueError('Unknown correlation method: %s' % method)
    corr_function = corr_functions[method]

    size = len(vars)

    s_default = plt.rcParams['lines.markersize']**2.
    scatter_kws = {"s": s_default * height / 6.4}

    result_arr = []
    for i in range(size):
        for j in range(i):
            r, p = corr_function(table[vars[i]], table[vars[j]])
            result_arr.append([vars[i], vars[j], r, p])

    df_result = pd.DataFrame(result_arr, columns=['x', 'y', 'corr', 'p_value'])

    def corr(x, y, **kwargs):
        """Annotate the current axes with the coefficient and its stars."""
        r, p = corr_functions[kwargs['method']](x, y)

        # One star per significance threshold passed.
        p_stars = ''
        if p <= 0.05:
            p_stars = '*'
        if p <= 0.01:
            p_stars = '**'
        if p <= 0.001:
            p_stars = '***'

        corr_text = '{:.{prec}f}'.format(r, prec=corr_prec)
        # Stronger correlations are drawn with a larger font.
        font_size = abs(r) * 15 * 2 / corr_prec + 5
        ax = plt.gca()
        ax.annotate(corr_text, [.5, .5], xycoords="axes fraction",
                    ha='center', va='center', fontsize=font_size * height)
        ax.annotate(p_stars, xy=(0.65, 0.6), xycoords=ax.transAxes,
                    color='red', fontsize=17 * height)

    g = sns.PairGrid(table, vars=vars, height=height)
    g.map_diag(sns.distplot)
    if method == 'pearson':
        g.map_lower(sns.regplot, scatter_kws=scatter_kws)
    else:
        g.map_lower(sns.regplot, lowess=True, scatter_kws=scatter_kws)
    g.map_upper(corr, method=method)

    fig_corr = plt2MD(plt)
    plt.clf()

    rb = ReportBuilder()
    # The first line carries the '|' margin like every other report line.
    rb.addMD(
        strip_margin("""
        | ## Correlation Results
        | ### Correlation Matrix
        | {fig_corr}
        |
        | ### Correlation Table
        | {table}
        """.format(fig_corr=fig_corr, table=pandasDF2MD(df_result))))

    params = {'vars': vars, 'method': method, 'height': height}

    res = dict()
    res['params'] = params
    res['corr_table'] = df_result
    res['report'] = rb.get()

    return {'result': res}
def _linear_regression_train(table, feature_cols, label_col,
                             fit_intercept=True):
    """Fit a linear regression and build a report with fit diagnostics.

    The model is fitted twice: with ``LinearRegression`` (returned as
    ``lr_model`` and used for predictions and residuals) and with
    ``sm.OLS`` (used for the summary tables and statistics).

    Args:
        table: Data frame holding the feature and label columns.
        feature_cols: Names of the feature columns.
        label_col: Name of the response column.
        fit_intercept: When equal to ``True`` an intercept is fitted.

    Returns:
        ``{'model': model}`` holding the fitted estimator, the OLS
        statistics, the three summary tables and the rendered report.
    """
    features = table[feature_cols]
    label = table[label_col]
    # Keyword argument: the estimator's parameters are keyword-only in
    # current scikit-learn releases.
    lr_model = LinearRegression(fit_intercept=fit_intercept)
    lr_model.fit(features, label)

    predict = lr_model.predict(features)
    residual = label - predict

    if fit_intercept == True:  # equality kept to preserve non-bool handling
        lr_model_fit = sm.OLS(label, sm.add_constant(features)).fit()
    else:
        lr_model_fit = sm.OLS(label, features).fit()
    summary = lr_model_fit.summary()
    summary_tables = simple_tables2df_list(summary.tables)
    summary0 = summary_tables[0]
    summary1 = summary_tables[1]
    summary2 = summary_tables[2]

    html_result = summary.as_html()

    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)

    # Least-squares line of actual on predicted values, from the normal
    # equations. Every sample contributes to each sum.
    x = predict
    y = np.array(label)
    n_points = x.size
    sum_x = np.sum(x)
    sum_xx = np.dot(x, x)
    sum_y = np.sum(y)
    sum_xy = np.dot(x, y)
    det = n_points * sum_xx - sum_x * sum_x
    if det != 0:
        # A zero determinant (e.g. constant predictions) has no unique line,
        # so the trend line is only drawn when one exists.
        intercept = (sum_xx * sum_y - sum_x * sum_xy) / det
        slope = (n_points * sum_xy - sum_x * sum_y) / det
        p1x = np.min(x)
        p2x = np.max(x)
        plt.plot([p1x, p2x],
                 [intercept + slope * p1x, intercept + slope * p2x], 'r--')
    fig_actual_predict = plt2MD(plt)

    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)

    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)

    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Linear Regression Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3)))

    model = _model_dict('linear_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['coefficients'] = lr_model_fit.params
    model['r2'] = lr_model_fit.rsquared
    model['adjusted_r2'] = lr_model_fit.rsquared_adj
    model['aic'] = lr_model_fit.aic
    model['bic'] = lr_model_fit.bic
    model['f_static'] = lr_model_fit.fvalue
    model['tvalues'] = lr_model_fit.tvalues
    model['pvalues'] = lr_model_fit.pvalues
    model['lr_model'] = lr_model
    model['report'] = rb.get()
    model['summary0'] = summary0
    model['summary1'] = summary1
    model['summary2'] = summary2

    return {'model': model}
def _kmeans_silhouette_train_predict(table, input_cols, n_clusters_list=range(2, 10), prediction_col='prediction', init='k-means++', n_init=10, max_iter=300, tol=1e-4, precompute_distances='auto', seed=None, n_jobs=1, algorithm='auto', n_samples=None):
    """Fit k-means for each candidate k and keep the best silhouette score.

    For every value in ``n_clusters_list`` a k-means model is fitted on
    ``table[input_cols]``, its mean silhouette score is computed and a
    two-panel figure (per-cluster silhouette profile, 2-component PCA scatter)
    is rendered. The model with the highest score is used to label the rows.

    Parameters
    ----------
    table : indexable by column names, with ``copy()``
        Source data; it is not modified, the result is written to a copy.
    input_cols : list
        Names of the columns used for clustering.
    n_clusters_list : iterable of int, default ``range(2, 10)``
        Candidate numbers of clusters. Any iterable is accepted.
    prediction_col : str, default 'prediction'
        Name of the column added to the output table.
    init, n_init, max_iter, tol, precompute_distances, n_jobs, algorithm
        Forwarded unchanged to the k-means estimator.
    seed : default None
        Forwarded to the k-means estimator as ``random_state``.
    n_samples : int, optional
        Forwarded to ``_kmeans_samples_plot``; defaults to ``len(table)``.

    Returns
    -------
    dict
        ``{'out_table': ..., 'model': ...}`` where ``model`` is the dict from
        ``_model_dict('kmeans_silhouette')`` extended with ``best_k``,
        ``best_centers``, ``best_model``, ``input_cols`` and ``report``.

    Raises
    ------
    ValueError
        If ``n_clusters_list`` is empty.
    """
    # Materialise once: the candidates are iterated twice and indexed below,
    # which would silently break for one-shot iterables such as generators.
    n_clusters_list = list(n_clusters_list)
    if not n_clusters_list:
        raise ValueError('n_clusters_list must contain at least one candidate number of clusters.')

    if n_samples is None:
        n_samples = len(table)
    inputarr = table[input_cols]

    # A single 2-D projection shared by all per-k scatter plots.
    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    silhouette_list = []
    models = []
    images = []
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k, init=init, n_init=n_init, max_iter=max_iter, tol=tol,
                           precompute_distances=precompute_distances, verbose=0, random_state=seed,
                           copy_x=True, n_jobs=n_jobs, algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_

        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        samples = silhouette_samples(inputarr, predict)

        pca2_centers = pca2_model.transform(centersk)

        _, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0
        for i, color in zip(range(k), colors):
            # Sorted silhouette values of cluster i, stacked above the previous cluster.
            si = samples[predict == i]
            si.sort()

            sizei = si.shape[0]
            y_upper = y_lower + sizei

            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, si,
                              facecolor=color, edgecolor=color, alpha=0.7)

            y_lower = y_upper

            ax2.scatter(pca2[:, 0][predict == i], pca2[:, 1][predict == i], color=color)

        # Mean silhouette score for this k, and the projected cluster centers.
        ax1.axvline(x=score, color="red")
        ax2.scatter(pca2_centers[:, 0], pca2_centers[:, 1], marker='x', edgecolors=1, s=200, color=colors)

        imagek = plt2MD(plt)
        plt.clf()
        images.append(imagek)

    # Pick the candidate with the highest mean silhouette score.
    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_

    fig_centers = _kmeans_centers_plot(input_cols, best_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, best_centers)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2)

    # Silhouette score per candidate k; ticks are labelled with the k values.
    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    fig_silhouette = plt2MD(plt)
    plt.clf()

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Silhouette Result
    | - silloutte metrics:
    | {fig_silhouette}
    | - best K: {best_k}
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette, best_k=best_k, fig_pca=fig_pca,
               fig_centers=fig_centers, fig_samples=fig_samples)))

    for k, image in zip(n_clusters_list, images):
        rb.addMD(strip_margin("""
        | ### k = {k}
        | {image}
        |
        """.format(k=k, image=image)))

    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['report'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = predict

    return {'out_table': out_table, 'model': model}