def _kmeans_train_predict(table, input_cols, n_clusters=3, prediction_col='prediction', init='k-means++', n_init=10,
                          max_iter=300, tol=1e-4, precompute_distances='auto', seed=None, n_jobs=1, algorithm='auto',
                          n_samples=None):
    """Fit a scikit-learn KMeans model on ``table[input_cols]`` and label every row.

    Parameters mirror ``sklearn.cluster.KMeans`` (``seed`` maps to
    ``random_state``); ``prediction_col`` names the output label column and
    ``n_samples`` bounds how many rows the sample plot draws.

    Returns a dict with:
      - 'out_table': a copy of ``table`` with cluster labels in ``prediction_col``
      - 'model': a Brightics model dict holding the fitted estimator, the input
        columns, and a markdown report (``_repr_brtc_``).
    """
    inputarr = table[input_cols]
    # Default the sample-plot size to the whole table when not specified.
    if n_samples is None:
        n_samples = len(inputarr)
    # Reject out-of-range hyper-parameters up front with user-facing messages.
    validate(greater_than_or_equal_to(n_clusters, 1, 'n_clusters'),
             greater_than_or_equal_to(n_init, 1, 'n_init'),
             greater_than_or_equal_to(max_iter, 1, 'max_iter'),
             greater_than(tol, 0.0, 'tol'),
             greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
             greater_than_or_equal_to(n_samples, 0, 'n_samples'))
    k_means = SKKMeans(n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter, tol=tol,
                       precompute_distances=precompute_distances, verbose=0, random_state=seed, copy_x=True,
                       n_jobs=n_jobs, algorithm=algorithm)
    k_means.fit(inputarr)
    # Hyper-parameters echoed back into the markdown report.
    params = {'input_cols':input_cols, 'n_clusters':n_clusters, 'init':init, 'n_init':n_init, 'max_iter':max_iter,
              'tol':tol, 'precompute_distances':precompute_distances, 'seed':seed, 'n_jobs':n_jobs,
              'algorithm':algorithm}
    cluster_centers = k_means.cluster_centers_
    labels = k_means.labels_
    # 2-component PCA is used only to project the data for the 2-D cluster plot.
    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)
    # Figure helpers are defined elsewhere in this module; they return
    # markdown/image snippets embedded in the report below.
    fig_centers = _kmeans_centers_plot(input_cols, cluster_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, cluster_centers)
    fig_pca = _kmeans_pca_plot(labels, cluster_centers, pca2_model, pca2)
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Result
    | - Number of iterations run: {n_iter_}. 
    | - Coordinates of cluster centers
    | {fig_cluster_centers}
    | - Samples
    | {fig_pca}
    | {fig_samples}
    |
    | ### Parameters
    | {params}
    """.format(n_iter_=k_means.n_iter_, fig_cluster_centers=fig_centers, fig_pca=fig_pca, fig_samples=fig_samples,
               params=dict2MD(params))))
    model = _model_dict('kmeans')
    model['model'] = k_means
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()
    # Labels are appended to a copy so the caller's table is left untouched.
    out_table = table.copy()
    out_table[prediction_col] = labels
    return {'out_table':out_table, 'model':model}
def _svm_classification_train(table, feature_cols, label_col, c=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0,
                              shrinking=True, probability=True, tol=1e-3, max_iter=-1, random_state=None):
    """Train an SVM classifier (``sklearn.svm.SVC``) on the given feature/label columns.

    Rejects a continuous label column, since classification requires discrete
    targets. Returns ``{'model': ...}`` where the model dict carries the fitted
    estimator, the feature column names, and a markdown parameter report.
    """
    validate(greater_than(c, 0.0, 'c'))
    frame = table.copy()
    features = frame[feature_cols]
    labels = frame[label_col]
    # A continuous target means the user picked a regression-style column.
    if sklearn_utils.multiclass.type_of_target(labels) == 'continuous':
        raise_runtime_error('''Label Column should not be continuous.''')
    classifier = svm.SVC(C=c, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking,
                         probability=probability, tol=tol, max_iter=max_iter, random_state=random_state)
    fitted = classifier.fit(features, labels)
    # Echo the estimator's parameters plus the column selections in the report.
    params = classifier.get_params()
    params['feature_cols'] = feature_cols
    params['label_col'] = label_col
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter}
    """.format(table_parameter=dict2MD(params))))
    model = _model_dict('svc_model')
    model['svc_model'] = fitted
    model['features'] = feature_cols
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def split_data(table, train_ratio=7.0, test_ratio=3.0, random_state=None, shuffle=True, stratify=None):
    """Split ``table`` into train/test partitions by the given ratio.

    ``train_ratio``/``test_ratio`` are relative weights (e.g. 7:3); only their
    proportion matters. Returns ``{'train_table': ..., 'test_table': ...}``
    with each partition's index reset (old index kept as a column, matching
    ``reset_index()`` default behavior).
    """
    validate(greater_than(train_ratio, 0.0, 'train_ratio'),
             greater_than(test_ratio, 0.0, 'test_ratio'))
    # Convert the weight pair into the test-set fraction expected by sklearn.
    test_fraction = test_ratio / (train_ratio + test_ratio)
    train_part, test_part = sktrain_test_split(table, test_size=test_fraction, random_state=random_state,
                                               shuffle=shuffle, stratify=stratify)
    return {
        'train_table': train_part.reset_index(),
        'test_table': test_part.reset_index()
    }
def _pairplot(table, x_vars, y_vars=None, kind='scatter', diag_kind='auto', markers=None, palette=None, height=2.5,
              aspect=1, dropna=True, hue=None):
    """Render a seaborn pairplot of the given columns and return it as a Brightics repr.

    Marker size is scaled with ``height`` so small grids stay readable; for
    non-scatter kinds the size goes through ``scatter_kws`` (the key regplot
    understands). Tick labels are rotated on small plots to avoid overlap.
    """
    validate(greater_than(height, 0, 'height'), greater_than(aspect, 0, 'aspect'))
    base_marker_area = plt.rcParams['lines.markersize'] ** 2.
    size_kws = {"s": base_marker_area * height / 6.4}
    if y_vars is None:
        y_vars = x_vars
    if kind == 'scatter':
        grid = sns.pairplot(table, x_vars=x_vars, y_vars=y_vars, kind=kind, diag_kind=diag_kind, markers=markers,
                            height=height, aspect=aspect, dropna=dropna, hue=hue, palette=palette,
                            plot_kws=size_kws)
    else:
        # Non-scatter kinds (e.g. 'reg') take marker options nested under 'scatter_kws'.
        grid = sns.pairplot(table, x_vars=x_vars, y_vars=y_vars, kind=kind, diag_kind=diag_kind, markers=markers,
                            height=height, aspect=aspect, dropna=dropna, hue=hue, palette=palette,
                            plot_kws={'scatter_kws': size_kws})
    # Rotate x tick labels progressively as the subplot height shrinks.
    if height <= 2.5:
        for axis in grid.axes.flatten():
            for tick_label in axis.get_xticklabels():
                tick_label.set_rotation(90 * (2.5 - height))
    rb = BrtcReprBuilder()
    rb.addPlt(plt)
    plt.clf()
    return {'result': {'_repr_brtc_': rb.get()}}
def _profile_table(table, bins=10, check_correlation=False, correlation_threshold=0.9, correlation_overrides=None):
    """Build a pandas-profiling report for ``table`` and wrap its HTML as a Brightics repr."""
    validate(greater_than_or_equal_to(bins, 1, 'bins'),
             greater_than(correlation_threshold, 0.0, 'correlation_threshold'))
    report = pd_profiling.ProfileReport(table, bins=bins, check_correlation=check_correlation,
                                        correlation_threshold=correlation_threshold,
                                        correlation_overrides=correlation_overrides)
    rb = BrtcReprBuilder()
    rb.addHTML(report.html)
    summary = {'_repr_brtc_': rb.get()}
    return {'result': summary}
def _kmeans_silhouette_train_predict(table, input_cols, n_clusters_list=range(2, 10), prediction_col='prediction',
                                     init='k-means++', n_init=10, max_iter=300, tol=1e-4,
                                     precompute_distances='auto', seed=None, n_jobs=1, algorithm='auto',
                                     n_samples=None):
    """Fit KMeans for every k in ``n_clusters_list``, pick the best k by silhouette score,
    and label every row with the winning model.

    For each candidate k a silhouette plot (left) and a 2-D PCA scatter of the
    clusters (right) is rendered into the report. Returns a dict with
    'out_table' (table plus ``prediction_col``) and 'model' (best k, centers,
    fitted estimator, and the markdown report).
    """
    # Default the sample-plot size to the whole table when not specified.
    if n_samples is None:
        n_samples = len(table)
    inputarr = table[input_cols]
    # Silhouette needs at least 2 clusters, hence the > 1 check on every candidate k.
    validate(all_elements_greater_than(n_clusters_list, 1, 'n_clusters_list'),
             greater_than_or_equal_to(n_init, 1, 'n_init'),
             greater_than_or_equal_to(max_iter, 1, 'max_iter'),
             greater_than(tol, 0.0, 'tol'),
             greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
             greater_than_or_equal_to(n_samples, 0, 'n_samples'))
    # Single 2-component PCA shared by all per-k scatter plots.
    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)
    silhouette_list = []
    # NOTE(review): 'silouette' is a pre-existing misspelling; kept to avoid code changes.
    silouette_samples_list = []
    models = []
    centers_list = []
    images = []
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k, init=init, n_init=n_init, max_iter=max_iter, tol=tol,
                           precompute_distances=precompute_distances, verbose=0, random_state=seed, copy_x=True,
                           n_jobs=n_jobs, algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_
        centers_list.append(centersk)
        # Mean silhouette over all samples decides the winning k below.
        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        # Per-sample silhouette values drive the fill plot for this k.
        samples = silhouette_samples(inputarr, predict)
        silouette_samples_list.append(samples)
        pca2_centers = pca2_model.transform(centersk)
        # ax1: silhouette profile per cluster; ax2: PCA scatter colored by cluster.
        _, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0
        for i, color in zip(range(k), colors):
            # Sorted silhouette values of cluster i, stacked vertically from y_lower.
            si = samples[predict == i]
            si.sort()
            sizei = si.shape[0]
            y_upper = y_lower + sizei
            ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, si, facecolor=color, edgecolor=color, alpha=0.7)
            y_lower = y_upper
            ax2.scatter(pca2[:, 0][predict == i], pca2[:, 1][predict == i], color=color)
        # Red line marks the mean silhouette score for this k.
        ax1.axvline(x=score, color="red")
        ax2.scatter(pca2_centers[:, 0], pca2_centers[:, 1], marker='x', edgecolors=1, s=200, color=colors)
        imagek = plt2MD(plt)
        plt.clf()
        images.append(imagek)
    # Winning k maximizes the mean silhouette score.
    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_
    # NOTE(review): best_labels is assigned but never used afterwards.
    best_labels = best_model.labels_
    fig_centers = _kmeans_centers_plot(input_cols, best_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, best_centers)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2)
    # Summary curve: silhouette score per candidate k.
    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    fig_silhouette = plt2MD(plt)
    plt.clf()
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Silhouette Result
    | - silloutte metrics:
    | {fig_silhouette}
    | - best K: {best_k}
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette, best_k=best_k, fig_pca=fig_pca, fig_centers=fig_centers,
               fig_samples=fig_samples)))
    # Append the per-k diagnostic figures to the report.
    for k, image in zip(n_clusters_list, images):
        rb.addMD(strip_margin("""
        | ### k = {k}
        | {image}
        |
        """.format(k=k, image=image)))
    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()
    # Labels are appended to a copy so the caller's table is left untouched.
    out_table = table.copy()
    out_table[prediction_col] = predict
    return {'out_table':out_table, 'model':model}
def _correlation(table, vars, method='pearson', height=2.5, corr_prec=2):
    """Compute pairwise correlations between the given columns and render a pair grid.

    ``method`` is one of 'pearson', 'spearman', or 'kendal' (pre-existing
    spelling of the Kendall tau key — kept for caller compatibility).
    ``corr_prec`` sets the displayed precision of the correlation coefficients.

    Returns ``{'result': ...}`` with the parameter echo, the correlation
    matrix, the p-value matrix, and the rendered plot (``_repr_brtc_``).
    """
    validate(greater_than(height, 0, 'height'),
             greater_than_or_equal_to(corr_prec, 1, 'corr_prec'))
    size = len(vars)
    s_default = plt.rcParams['lines.markersize'] ** 2.
    scatter_kws = {"s": s_default * height / 6.4}
    # Symmetric matrices: coefficients default to 1 (diagonal), p-values to 0.
    corr_arr = np.ones((size, size))
    p_arr = np.zeros((size, size))
    # Fill the strict lower triangle pairwise.
    for i in range(size):
        for j in range(i):
            if method == 'pearson':
                r, p = stats.pearsonr(table[vars[i]], table[vars[j]])
            elif method == 'spearman':
                r, p = stats.spearmanr(table[vars[i]], table[vars[j]])
            elif method == 'kendal':
                r, p = stats.kendalltau(table[vars[i]], table[vars[j]])
            corr_arr[i][j] = r
            p_arr[i][j] = p
    # Mirror the lower triangle into the upper triangle.
    for i in range(size):
        for j in range(i, size):
            corr_arr[i][j] = corr_arr[j][i]
            p_arr[i][j] = p_arr[j][i]

    def corr(x, y, **kwargs):
        """Pair-grid callback: annotate a cell with r (sized by |r|) and significance stars."""
        if kwargs['method'] == 'pearson':
            r, p = stats.pearsonr(x, y)
        elif kwargs['method'] == 'spearman':
            r, p = stats.spearmanr(x, y)
        elif kwargs['method'] == 'kendal':
            r, p = stats.kendalltau(x, y)
        # Conventional significance stars at the 0.05 / 0.01 / 0.001 levels.
        p_stars = ''
        if p <= 0.05:
            p_stars = '*'
            if p <= 0.01:
                p_stars = '**'
                if p <= 0.001:
                    p_stars = '***'
        corr_text = '{:.{prec}f}'.format(r, prec=corr_prec)
        # Fixed: removed a leftover debug `print(type(corr_prec))` that polluted
        # stdout once per upper-triangle cell.
        font_size = abs(r) * 15 * 2 / corr_prec + 5
        ax = plt.gca()
        ax.annotate(corr_text, [.5, .5, ], xycoords="axes fraction",
                    ha='center', va='center', fontsize=font_size * height)
        ax.annotate(p_stars, xy=(0.65, 0.6), xycoords=ax.transAxes,
                    color='red', fontsize=17 * height)

    g = sns.PairGrid(table, vars=vars, height=height)
    g.map_diag(sns.distplot)
    if method == 'pearson':
        g.map_lower(sns.regplot, scatter_kws=scatter_kws)
    else:
        # Rank-based methods get a lowess fit instead of a linear one.
        g.map_lower(sns.regplot, lowess=True, scatter_kws=scatter_kws)
    g.map_upper(corr, method=method)
    rb = BrtcReprBuilder()
    rb.addPlt(plt)
    plt.clf()
    params = {'vars': vars, 'method': method, 'height': height}
    res = dict()
    res['params'] = params
    res['corr'] = corr_arr
    res['pvalue'] = p_arr
    res['_repr_brtc_'] = rb.get()
    return {'result': res}