def tfidf(table, group_by=None, **params):  # This will be deprecated.
    """Check, default-fill and validate the parameters, then run ``_tfidf``.

    Args:
        table: Input forwarded unchanged to ``_tfidf``.
        group_by: When not ``None``, ``_tfidf`` is applied through
            ``_function_by_group`` with this grouping specification.
        **params: Keyword arguments for ``_tfidf``. ``min_df``, ``num_voca``
            and ``max_df`` are validated here.

    Returns:
        The result of ``_tfidf`` or, when grouping, of ``_function_by_group``.
    """
    check_required_parameters(_tfidf, params, ['table'])
    filled_params = get_default_from_parameters_if_required(params, _tfidf)

    validate(
        greater_than_or_equal_to(filled_params, 0, 'min_df'),
        greater_than_or_equal_to(filled_params, 2, 'num_voca'),
        greater_than(filled_params, 0, 'max_df'),
    )

    if group_by is None:
        return _tfidf(table, **filled_params)
    return _function_by_group(_tfidf, table, group_by=group_by, **filled_params)
def mean_shift(table, group_by=None, **params):
    """Check, default-fill and validate the parameters, then run ``_mean_shift``.

    Args:
        table: Input forwarded unchanged to ``_mean_shift``.
        group_by: When not ``None``, ``_mean_shift`` is applied through
            ``_function_by_group`` with this grouping specification.
        **params: Keyword arguments for ``_mean_shift``. ``bandwidth`` is
            validated here with ``greater_than(..., 0.0, 'bandwidth')``.

    Returns:
        The result of ``_mean_shift`` or, when grouping, of
        ``_function_by_group``.
    """
    check_required_parameters(_mean_shift, params, ['table'])
    filled_params = get_default_from_parameters_if_required(params, _mean_shift)

    validate(greater_than(filled_params, 0.0, 'bandwidth'))

    if group_by is None:
        return _mean_shift(table, **filled_params)
    return _function_by_group(_mean_shift, table, group_by=group_by, **filled_params)
def _split_data(table, train_ratio=7.0, test_ratio=3.0, random_state=None, shuffle=True, stratify=None):
    """Split ``table`` into a train part and a test part.

    The test fraction handed to ``sktrain_test_split`` is
    ``test_ratio / (train_ratio + test_ratio)``, so the two ratios only need
    to be relative weights.

    Args:
        table: Data to split; the split parts must support
            ``reset_index(drop=True)``.
        train_ratio: Relative weight of the train part (validated > 0.0).
        test_ratio: Relative weight of the test part (validated > 0.0).
        random_state: Forwarded to ``sktrain_test_split``.
        shuffle: Forwarded to ``sktrain_test_split``.
        stratify: Forwarded to ``sktrain_test_split``.

    Returns:
        dict with ``'train_table'`` and ``'test_table'``, both re-indexed
        from zero.
    """
    validate(
        greater_than(train_ratio, 0.0, 'train_ratio'),
        greater_than(test_ratio, 0.0, 'test_ratio'),
    )

    test_fraction = test_ratio / (train_ratio + test_ratio)
    train_part, test_part = sktrain_test_split(
        table,
        test_size=test_fraction,
        random_state=random_state,
        shuffle=shuffle,
        stratify=stratify,
    )

    result = {}
    result['train_table'] = train_part.reset_index(drop=True)
    result['test_table'] = test_part.reset_index(drop=True)
    return result
def profile_table(table, group_by=None, **params):
    """Check, default-fill and validate the parameters, then run ``_profile_table``.

    Args:
        table: Input forwarded unchanged to ``_profile_table``.
        group_by: When not ``None``, ``_profile_table`` is applied through
            ``_function_by_group`` with this grouping specification.
        **params: Keyword arguments for ``_profile_table``. ``bins`` and
            ``correlation_threshold`` are validated here.

    Returns:
        The result of ``_profile_table`` or, when grouping, of
        ``_function_by_group``.
    """
    check_required_parameters(_profile_table, params, ['table'])
    filled_params = get_default_from_parameters_if_required(params, _profile_table)

    validate(
        greater_than_or_equal_to(filled_params, 1, 'bins'),
        greater_than(filled_params, 0.0, 'correlation_threshold'),
    )

    if group_by is None:
        return _profile_table(table, **filled_params)
    return _function_by_group(_profile_table, table, group_by=group_by, **filled_params)
def _pairplot(table, x_vars, y_vars=None, kind='scatter', diag_kind='auto', markers=None, palette=None,
              height=2.5, aspect=1, dropna=True, hue=None):
    """Draw a seaborn pair plot of ``table`` and return it as a report.

    Args:
        table: Data passed to ``sns.pairplot``.
        x_vars: Columns for the horizontal axes.
        y_vars: Columns for the vertical axes; falls back to ``x_vars`` when
            ``None``.
        kind, diag_kind, markers, palette, aspect, dropna, hue: Forwarded to
            ``sns.pairplot``.
        height: Forwarded to ``sns.pairplot`` and also used to scale the
            marker size and the x tick label rotation (validated > 0, as is
            ``aspect``).

    Returns:
        ``{'result': {'_repr_brtc_': ...}}`` holding the rendered figure.
    """
    validate(greater_than(height, 0, 'height'), greater_than(aspect, 0, 'aspect'))

    # Marker area scales with the requested facet height.
    default_marker_area = plt.rcParams['lines.markersize'] ** 2.
    marker_kws = {"s": default_marker_area * height / 6.4}

    if y_vars is None:
        y_vars = x_vars

    # For non-scatter kinds the marker settings are nested under 'scatter_kws'.
    plot_kws = marker_kws if kind == 'scatter' else {'scatter_kws': marker_kws}

    g = sns.pairplot(
        table,
        x_vars=x_vars,
        y_vars=y_vars,
        kind=kind,
        diag_kind=diag_kind,
        markers=markers,
        height=height,
        aspect=aspect,
        dropna=dropna,
        hue=hue,
        palette=palette,
        plot_kws=plot_kws,
    )

    if height <= 2.5:
        # The smaller the facets, the more the x tick labels are rotated.
        rotation = 90 * (2.5 - height)
        for ax in g.axes.flatten():
            for tick_label in ax.get_xticklabels():
                tick_label.set_rotation(rotation)

    builder = BrtcReprBuilder()
    builder.addPlt(plt)
    plt.clf()

    return {'result': {'_repr_brtc_': builder.get()}}
def bow(table, group_by=None, **params):
    """Check, default-fill and validate the parameters, then run ``_bow``.

    Args:
        table: Input forwarded unchanged to ``_bow``.
        group_by: When not ``None``, ``_bow`` is applied through
            ``_function_by_group`` with this grouping specification.
        **params: Keyword arguments for ``_bow``. ``no_below``, ``no_above``
            (two checks: upper limit 1.0 and lower limit 0.0) and ``keep_n``
            are validated here.

    Returns:
        The result of ``_bow`` or, when grouping, of ``_function_by_group``.
    """
    check_required_parameters(_bow, params, ['table'])
    filled_params = get_default_from_parameters_if_required(params, _bow)

    validate(
        greater_than_or_equal_to(filled_params, 0, 'no_below'),
        less_than_or_equal_to(filled_params, 1.0, 'no_above'),
        greater_than(filled_params, 0.0, 'no_above'),
        greater_than_or_equal_to(filled_params, 1, 'keep_n'),
    )

    if group_by is None:
        return _bow(table, **filled_params)
    return _function_by_group(_bow, table, group_by=group_by, **filled_params)
def naive_bayes_train(table, group_by=None, **params):
    """Check, default-fill and validate the parameters, then run ``_naive_bayes_train``.

    The required-parameter check runs first, matching the other public
    wrappers in this module, so a missing required argument is reported
    before any value validation is attempted.

    Args:
        table: Input forwarded unchanged to ``_naive_bayes_train``.
        group_by: When not ``None``, ``_naive_bayes_train`` is applied through
            ``_function_by_group`` with this grouping specification.
        **params: Keyword arguments for ``_naive_bayes_train``. ``alpha`` is
            validated here with ``greater_than(..., 0, 'alpha')``.

    Returns:
        The result of ``_naive_bayes_train`` or, when grouping, of
        ``_function_by_group``.
    """
    check_required_parameters(_naive_bayes_train, params, ['table'])
    params = get_default_from_parameters_if_required(params, _naive_bayes_train)

    param_validation_check = [greater_than(params, 0, 'alpha')]
    validate(*param_validation_check)

    if group_by is not None:
        return _function_by_group(_naive_bayes_train, table, group_by=group_by, **params)
    return _naive_bayes_train(table, **params)
def lda(table, group_by=None, **params):
    """Check, default-fill and validate the parameters, then run ``_lda``.

    Args:
        table: Input forwarded unchanged to ``_lda``.
        group_by: When not ``None``, ``_lda`` is applied through
            ``_function_by_group`` with this grouping specification.
        **params: Keyword arguments for ``_lda``. ``num_voca``, ``num_topic``,
            ``num_topic_word``, ``max_iter`` and ``learning_offset`` are
            validated here; the ``num_topic_word`` range check uses the
            (default-filled) ``num_voca`` value as its upper limit.

    Returns:
        The result of ``_lda`` or, when grouping, of ``_function_by_group``.
    """
    check_required_parameters(_lda, params, ['table'])
    filled_params = get_default_from_parameters_if_required(params, _lda)

    validate(
        greater_than_or_equal_to(filled_params, 2, 'num_voca'),
        greater_than_or_equal_to(filled_params, 2, 'num_topic'),
        from_to(filled_params, 2, filled_params['num_voca'], 'num_topic_word'),
        greater_than_or_equal_to(filled_params, 1, 'max_iter'),
        greater_than(filled_params, 1.0, 'learning_offset'),
    )

    if group_by is None:
        return _lda(table, **filled_params)
    return _function_by_group(_lda, table, group_by=group_by, **filled_params)
def correlation(table, group_by=None, **params):
    """Check, default-fill and validate the parameters, then run ``_correlation``.

    Args:
        table: Input forwarded unchanged to ``_correlation``.
        group_by: When not ``None``, ``_correlation`` is applied through
            ``_function_by_group`` with this grouping specification.
        **params: Keyword arguments for ``_correlation``. ``height`` and
            ``corr_prec`` are validated here.

    Returns:
        The result of ``_correlation`` or, when grouping, of
        ``_function_by_group``.
    """
    check_required_parameters(_correlation, params, ['table'])
    filled_params = get_default_from_parameters_if_required(params, _correlation)

    validate(
        greater_than(filled_params, 0, 'height'),
        greater_than_or_equal_to(filled_params, 1, 'corr_prec'),
    )

    if group_by is None:
        return _correlation(table, **filled_params)
    return _function_by_group(_correlation, table, group_by=group_by, **filled_params)
def outlier_detection_tukey_carling(table, group_by=None, **params):
    """Check, default-fill and validate the parameters, then run
    ``_outlier_detection_tukey_carling``.

    Args:
        table: Input forwarded unchanged to
            ``_outlier_detection_tukey_carling``.
        group_by: When not ``None``, the implementation is applied through
            ``_function_by_group`` with this grouping specification.
        **params: Keyword arguments for the implementation. ``multiplier`` is
            validated here with ``greater_than(..., 0.0, 'multiplier')``.

    Returns:
        The result of ``_outlier_detection_tukey_carling`` or, when grouping,
        of ``_function_by_group``.
    """
    check_required_parameters(_outlier_detection_tukey_carling, params, ['table'])
    filled_params = get_default_from_parameters_if_required(params, _outlier_detection_tukey_carling)

    validate(greater_than(filled_params, 0.0, 'multiplier'))

    if group_by is None:
        return _outlier_detection_tukey_carling(table, **filled_params)
    return _function_by_group(_outlier_detection_tukey_carling, table, group_by=group_by, **filled_params)
def kmeans_silhouette_train_predict(table, group_by=None, **params):
    """Check, default-fill and validate the parameters, then run
    ``_kmeans_silhouette_train_predict``.

    Args:
        table: Input forwarded unchanged to
            ``_kmeans_silhouette_train_predict``.
        group_by: When not ``None``, the implementation is applied through
            ``_function_by_group`` with this grouping specification.
        **params: Keyword arguments for the implementation.
            ``n_clusters_list``, ``n_init``, ``max_iter``, ``tol``, ``n_jobs``
            and ``n_samples`` are validated here.

    Returns:
        The result of ``_kmeans_silhouette_train_predict`` or, when grouping,
        of ``_function_by_group``.
    """
    check_required_parameters(_kmeans_silhouette_train_predict, params, ['table'])
    filled_params = get_default_from_parameters_if_required(params, _kmeans_silhouette_train_predict)

    validate(
        all_elements_greater_than(filled_params, 1, 'n_clusters_list'),
        greater_than_or_equal_to(filled_params, 1, 'n_init'),
        greater_than_or_equal_to(filled_params, 1, 'max_iter'),
        greater_than(filled_params, 0.0, 'tol'),
        greater_than_or_equal_to(filled_params, 1, 'n_jobs'),
        greater_than_or_equal_to(filled_params, 0, 'n_samples'),
    )

    if group_by is None:
        return _kmeans_silhouette_train_predict(table, **filled_params)
    return _function_by_group(_kmeans_silhouette_train_predict, table, group_by=group_by, **filled_params)
def decision_tree_regression_train(table, group_by=None, **params):
    """Check, default-fill and validate the parameters, then run
    ``_decision_tree_regression_train``.

    Args:
        table: Input forwarded unchanged to
            ``_decision_tree_regression_train``.
        group_by: When not ``None``, the implementation is applied through
            ``_function_by_group`` with this grouping specification.
        **params: Keyword arguments for the implementation.
            ``min_samples_split``, ``min_samples_leaf``,
            ``min_weight_fraction_leaf``, ``max_depth``, ``max_features``,
            ``max_leaf_nodes`` and ``min_impurity_split`` are validated here.

    Returns:
        The result of ``_decision_tree_regression_train`` or, when grouping,
        of ``_function_by_group``.
    """
    check_required_parameters(_decision_tree_regression_train, params, ['table'])
    filled_params = get_default_from_parameters_if_required(params, _decision_tree_regression_train)

    validate(
        greater_than_or_equal_to(filled_params, 2, 'min_samples_split'),
        greater_than_or_equal_to(filled_params, 1, 'min_samples_leaf'),
        greater_than_or_equal_to(filled_params, 0.0, 'min_weight_fraction_leaf'),
        greater_than_or_equal_to(filled_params, 1, 'max_depth'),
        greater_than_or_equal_to(filled_params, 1, 'max_features'),
        greater_than(filled_params, 1, 'max_leaf_nodes'),
        greater_than_or_equal_to(filled_params, 0.0, 'min_impurity_split'),
    )

    if group_by is None:
        return _decision_tree_regression_train(table, **filled_params)
    return _function_by_group(_decision_tree_regression_train, table, group_by=group_by, **filled_params)
def ada_boost_regression_train(table, group_by=None, **params):
    """Check, default-fill and validate the parameters, then run
    ``_ada_boost_regression_train``.

    Args:
        table: Input forwarded unchanged to ``_ada_boost_regression_train``.
        group_by: When not ``None``, the implementation is applied through
            ``_function_by_group`` with this grouping specification.
        **params: Keyword arguments for the implementation. ``max_depth``,
            ``n_estimators`` and ``learning_rate`` are validated here.

    Returns:
        The result of ``_ada_boost_regression_train`` or, when grouping, of
        ``_function_by_group``.
    """
    check_required_parameters(_ada_boost_regression_train, params, ['table'])
    filled_params = get_default_from_parameters_if_required(params, _ada_boost_regression_train)

    validate(
        greater_than_or_equal_to(filled_params, 2, 'max_depth'),
        greater_than_or_equal_to(filled_params, 1, 'n_estimators'),
        greater_than(filled_params, 0, 'learning_rate'),
    )

    if group_by is None:
        return _ada_boost_regression_train(table, **filled_params)
    return _function_by_group(_ada_boost_regression_train, table, group_by=group_by, **filled_params)
def svm_classification_train(table, group_by=None, **params):
    """Check, default-fill and validate the parameters, then run
    ``_svm_classification_train``.

    Args:
        table: Input forwarded unchanged to ``_svm_classification_train``.
        group_by: When not ``None``, the implementation is applied through
            ``_function_by_group`` with this grouping specification.
        **params: Keyword arguments for the implementation. ``c`` (checked
            with ``over_to(..., 0.0, 1.0, 'c')``), ``degree`` and ``tol`` are
            validated here.

    Returns:
        The result of ``_svm_classification_train`` or, when grouping, of
        ``_function_by_group``.
    """
    check_required_parameters(_svm_classification_train, params, ['table'])
    filled_params = get_default_from_parameters_if_required(params, _svm_classification_train)

    validate(
        over_to(filled_params, 0.0, 1.0, 'c'),
        greater_than_or_equal_to(filled_params, 0, 'degree'),
        greater_than(filled_params, 0.0, 'tol'),
    )

    if group_by is None:
        return _svm_classification_train(table, **filled_params)
    return _function_by_group(_svm_classification_train, table, group_by=group_by, **filled_params)
def penalized_linear_regression_train(table, group_by=None, **params):
    """Check, default-fill and validate the parameters, then run
    ``_penalized_linear_regression_train``.

    Args:
        table: Input forwarded unchanged to
            ``_penalized_linear_regression_train``.
        group_by: When not ``None``, the implementation is applied through
            ``_function_by_group`` with this grouping specification.
        **params: Keyword arguments for the implementation. ``alpha``,
            ``l1_ratio`` (checked with ``from_to(..., 0.0, 1.0, ...)``),
            ``max_iter`` and ``tol`` are validated here.

    Returns:
        The result of ``_penalized_linear_regression_train`` or, when
        grouping, of ``_function_by_group``.
    """
    check_required_parameters(_penalized_linear_regression_train, params, ['table'])
    filled_params = get_default_from_parameters_if_required(params, _penalized_linear_regression_train)

    validate(
        greater_than_or_equal_to(filled_params, 0.0, 'alpha'),
        from_to(filled_params, 0.0, 1.0, 'l1_ratio'),
        greater_than_or_equal_to(filled_params, 1, 'max_iter'),
        greater_than(filled_params, 0.0, 'tol'),
    )

    if group_by is None:
        return _penalized_linear_regression_train(table, **filled_params)
    return _function_by_group(_penalized_linear_regression_train, table, group_by=group_by, **filled_params)
def _profile_table(table, bins=10, check_correlation=False, correlation_threshold=0.9,
                   correlation_overrides=None):
    """Build a ``pd_profiling.ProfileReport`` for ``table`` and wrap its HTML.

    Args:
        table: Data handed to ``pd_profiling.ProfileReport``.
        bins: Forwarded to the report (validated >= 1).
        check_correlation: Forwarded to the report.
        correlation_threshold: Forwarded to the report (validated > 0.0).
        correlation_overrides: Forwarded to the report.

    Returns:
        ``{'result': {'_repr_brtc_': ...}}`` holding the report HTML.
    """
    validate(
        greater_than_or_equal_to(bins, 1, 'bins'),
        greater_than(correlation_threshold, 0.0, 'correlation_threshold'),
    )

    builder = BrtcReprBuilder()
    report = pd_profiling.ProfileReport(
        table,
        bins=bins,
        check_correlation=check_correlation,
        correlation_threshold=correlation_threshold,
        correlation_overrides=correlation_overrides,
    )
    builder.addHTML(report.html)

    return {'result': {'_repr_brtc_': builder.get()}}
def _kmeans_silhouette_train_predict(table, input_cols, n_clusters_list=range(2, 10), prediction_col='prediction',
                                     init='k-means++', n_init=10, max_iter=300, tol=1e-4,
                                     precompute_distances='auto', seed=None, n_jobs=1, algorithm='auto',
                                     n_samples=None):
    """Fit k-means for every k in ``n_clusters_list`` and keep the best one.

    One model is fitted per candidate k; the model with the highest
    ``silhouette_score`` is used to label ``table``. A report is built with
    the score curve, plots for the best model, and one silhouette/PCA figure
    per candidate k.

    Args:
        table: Input data; ``table[input_cols]`` is the feature matrix and
            ``table.copy()`` must be supported.
        input_cols: Columns used as features.
        n_clusters_list: Candidate cluster counts; must be indexable and every
            element is validated with ``all_elements_greater_than(..., 1, ...)``.
        prediction_col: Name of the output column holding the labels.
        init, n_init, max_iter, tol, precompute_distances, n_jobs, algorithm:
            Forwarded to ``SKKMeans``.
        seed: Forwarded to ``SKKMeans`` as ``random_state``.
        n_samples: Forwarded to ``_kmeans_samples_plot``; defaults to
            ``len(table)``.

    Returns:
        dict with ``'out_table'`` (copy of ``table`` plus ``prediction_col``)
        and ``'model'`` (keys ``best_k``, ``best_centers``, ``best_model``,
        ``input_cols`` and ``_repr_brtc_``).
    """
    if n_samples is None:
        n_samples = len(table)
    inputarr = table[input_cols]

    validate(all_elements_greater_than(n_clusters_list, 1, 'n_clusters_list'),
             greater_than_or_equal_to(n_init, 1, 'n_init'),
             greater_than_or_equal_to(max_iter, 1, 'max_iter'),
             greater_than(tol, 0.0, 'tol'),
             greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
             greater_than_or_equal_to(n_samples, 0, 'n_samples'))

    # 2-D projection shared by every per-k scatter plot.
    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    silhouette_list = []
    models = []
    images = []
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k, init=init, n_init=n_init, max_iter=max_iter, tol=tol,
                           precompute_distances=precompute_distances, verbose=0, random_state=seed,
                           copy_x=True, n_jobs=n_jobs, algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_

        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        samples = silhouette_samples(inputarr, predict)

        pca2_centers = pca2_model.transform(centersk)

        # Left axis: per-cluster sorted silhouette values; right axis: PCA scatter.
        _, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0
        for i, color in enumerate(colors):
            si = np.sort(samples[predict == i])
            sizei = si.shape[0]
            y_upper = y_lower + sizei
            ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, si,
                              facecolor=color, edgecolor=color, alpha=0.7)
            # Stack the next cluster's band directly above this one.
            y_lower = y_upper
            ax2.scatter(pca2[:, 0][predict == i], pca2[:, 1][predict == i], color=color)

        # Red vertical line marks the overall silhouette score for this k.
        ax1.axvline(x=score, color="red")
        ax2.scatter(pca2_centers[:, 0], pca2_centers[:, 1], marker='x', edgecolors=1, s=200, color=colors)

        imagek = plt2MD(plt)
        plt.clf()
        images.append(imagek)

    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_

    fig_centers = _kmeans_centers_plot(input_cols, best_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, best_centers)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2)

    # Silhouette score per candidate k, with the k values as tick labels.
    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    fig_silhouette = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Silhouette Result
    | - silloutte metrics:
    | {fig_silhouette}
    | - best K: {best_k}
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette, best_k=best_k, fig_pca=fig_pca, fig_centers=fig_centers,
               fig_samples=fig_samples)))

    for k, image in zip(n_clusters_list, images):
        rb.addMD(strip_margin("""
        | ### k = {k}
        | {image}
        |
        """.format(k=k, image=image)))

    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = predict

    return {'out_table': out_table, 'model': model}
def _correlation(table, vars, method='pearson', height=2.5, corr_prec=2):
    """Compute pairwise correlations of ``vars`` and render a pair-grid report.

    Args:
        table: Data containing the columns in ``vars``.
        vars: Column names to correlate; must be indexable.
        method: ``'pearson'``, ``'spearman'`` or ``'kendall'``; selects the
            ``scipy.stats`` function used for the coefficient and p-value.
        height: Facet height for ``sns.PairGrid`` (validated > 0); also scales
            marker and annotation font sizes.
        corr_prec: Number of decimals shown for each coefficient
            (validated >= 1).

    Returns:
        ``{'result': res}`` where ``res`` holds ``'params'``, ``'corr_table'``
        (columns ``x``, ``y``, ``corr``, ``p_value``) and ``'_repr_brtc_'``.
    """
    validate(greater_than(height, 0, 'height'),
             greater_than_or_equal_to(corr_prec, 1, 'corr_prec'))

    n_vars = len(vars)
    default_marker_area = plt.rcParams['lines.markersize'] ** 2.
    scatter_kws = {"s": default_marker_area * height / 6.4}

    # One row per unordered pair (i > j) of variables.
    rows = []
    for i in range(n_vars):
        name_i = vars[i]
        for j in range(i):
            name_j = vars[j]
            if method == 'pearson':
                r, p = stats.pearsonr(table[name_i], table[name_j])
            elif method == 'spearman':
                r, p = stats.spearmanr(table[name_i], table[name_j])
            elif method == 'kendall':
                r, p = stats.kendalltau(table[name_i], table[name_j])
            rows.append([name_i, name_j, r, p])

    df_result = pd.DataFrame(rows, columns=['x', 'y', 'corr', 'p_value'])

    def corr(x, y, **kwargs):
        """Annotate the current axes with the coefficient and significance stars."""
        how = kwargs['method']
        if how == 'pearson':
            r, p = stats.pearsonr(x, y)
        elif how == 'spearman':
            r, p = stats.spearmanr(x, y)
        elif how == 'kendall':
            r, p = stats.kendalltau(x, y)

        # More stars for smaller p-values.
        if p <= 0.001:
            p_stars = '***'
        elif p <= 0.01:
            p_stars = '**'
        elif p <= 0.05:
            p_stars = '*'
        else:
            p_stars = ''

        corr_text = '{:.{prec}f}'.format(r, prec=corr_prec)
        # Stronger correlations are printed in a larger font.
        font_size = abs(r) * 15 * 2 / corr_prec + 5
        ax = plt.gca()
        ax.annotate(corr_text, [.5, .5], xycoords="axes fraction",
                    ha='center', va='center', fontsize=font_size * height)
        ax.annotate(p_stars, xy=(0.65, 0.6), xycoords=ax.transAxes,
                    color='red', fontsize=17 * height)

    g = sns.PairGrid(table, vars=vars, height=height)
    g.map_diag(sns.distplot)
    # Only the Pearson case uses the plain regression fit; others use lowess.
    if method == 'pearson':
        lower_kws = {'scatter_kws': scatter_kws}
    else:
        lower_kws = {'lowess': True, 'scatter_kws': scatter_kws}
    g.map_lower(sns.regplot, **lower_kws)
    g.map_upper(corr, method=method)

    fig_corr = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    ## Correlation Results
    | ### Correlation Matrix
    | {fig_corr}
    |
    | ### Correlation Table
    | {table}
    """.format(fig_corr=fig_corr, table=pandasDF2MD(df_result))))

    res = {
        'params': {'vars': vars, 'method': method, 'height': height},
        'corr_table': df_result,
        '_repr_brtc_': rb.get(),
    }

    return {'result': res}
def _kmeans_train_predict(table, input_cols, n_clusters=3, prediction_col='prediction', init='k-means++',
                          n_init=10, max_iter=300, tol=1e-4, precompute_distances='auto', seed=None,
                          n_jobs=1, algorithm='auto', n_samples=None):
    """Fit a single k-means model on ``table[input_cols]`` and label the rows.

    Args:
        table: Input data; ``table.copy()`` must be supported.
        input_cols: Columns used as features.
        n_clusters: Number of clusters (validated >= 1).
        prediction_col: Name of the output column holding the labels.
        init, n_init, max_iter, tol, precompute_distances, n_jobs, algorithm:
            Forwarded to ``SKKMeans``.
        seed: Forwarded to ``SKKMeans`` as ``random_state``.
        n_samples: Forwarded to ``_kmeans_samples_plot``; defaults to the
            number of rows in ``table[input_cols]``.

    Returns:
        dict with ``'out_table'`` (copy of ``table`` plus ``prediction_col``)
        and ``'model'`` (keys ``model``, ``input_cols`` and ``_repr_brtc_``).
    """
    features = table[input_cols]
    if n_samples is None:
        n_samples = len(features)

    validate(
        greater_than_or_equal_to(n_clusters, 1, 'n_clusters'),
        greater_than_or_equal_to(n_init, 1, 'n_init'),
        greater_than_or_equal_to(max_iter, 1, 'max_iter'),
        greater_than(tol, 0.0, 'tol'),
        greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
        greater_than_or_equal_to(n_samples, 0, 'n_samples'),
    )

    estimator = SKKMeans(
        n_clusters=n_clusters,
        init=init,
        n_init=n_init,
        max_iter=max_iter,
        tol=tol,
        precompute_distances=precompute_distances,
        verbose=0,
        random_state=seed,
        copy_x=True,
        n_jobs=n_jobs,
        algorithm=algorithm,
    )
    estimator.fit(features)

    # Parameters echoed in the report.
    report_params = {
        'input_cols': input_cols,
        'n_clusters': n_clusters,
        'init': init,
        'n_init': n_init,
        'max_iter': max_iter,
        'tol': tol,
        'precompute_distances': precompute_distances,
        'seed': seed,
        'n_jobs': n_jobs,
        'algorithm': algorithm,
    }

    centers = estimator.cluster_centers_
    assigned = estimator.labels_

    # 2-D projection used by the PCA scatter plot.
    projector = PCA(n_components=2).fit(features)
    projected = projector.transform(features)

    fig_centers = _kmeans_centers_plot(input_cols, centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, centers)
    fig_pca = _kmeans_pca_plot(assigned, centers, projector, projected)

    builder = BrtcReprBuilder()
    builder.addMD(strip_margin("""
    | ## Kmeans Result
    | - Number of iterations run: {n_iter_}.
    | - Coordinates of cluster centers
    | {fig_cluster_centers}
    | - Samples
    | {fig_pca}
    | {fig_samples}
    |
    | ### Parameters
    | {params}
    """.format(n_iter_=estimator.n_iter_, fig_cluster_centers=fig_centers, fig_pca=fig_pca,
               fig_samples=fig_samples, params=dict2MD(report_params))))

    model = _model_dict('kmeans')
    model['model'] = estimator
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = builder.get()

    labelled = table.copy()
    labelled[prediction_col] = assigned

    return {'out_table': labelled, 'model': model}