def _kmeans_train_predict(table, input_cols, n_clusters=3, prediction_col='prediction', init='k-means++', n_init=10,
                          max_iter=300, tol=1e-4, precompute_distances='auto', seed=None, n_jobs=1, algorithm='auto',
                          n_samples=None):
    """Fit a scikit-learn KMeans model on ``table[input_cols]`` and label every row.

    Parameters mirror ``sklearn.cluster.KMeans`` (``seed`` maps to
    ``random_state``); ``prediction_col`` names the output label column and
    ``n_samples`` bounds how many rows the sample plot draws.

    Returns a dict with:
      - 'out_table': a copy of ``table`` with cluster labels in ``prediction_col``
      - 'model': a Brightics model dict holding the fitted estimator, the input
        columns, and a markdown report (``_repr_brtc_``).
    """
    inputarr = table[input_cols]
    # Default the sample-plot size to the whole table when not specified.
    if n_samples is None:
        n_samples = len(inputarr)
    # Reject out-of-range hyper-parameters up front with user-facing messages.
    validate(greater_than_or_equal_to(n_clusters, 1, 'n_clusters'),
             greater_than_or_equal_to(n_init, 1, 'n_init'),
             greater_than_or_equal_to(max_iter, 1, 'max_iter'),
             greater_than(tol, 0.0, 'tol'),
             greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
             greater_than_or_equal_to(n_samples, 0, 'n_samples'))
    k_means = SKKMeans(n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter, tol=tol,
                       precompute_distances=precompute_distances, verbose=0, random_state=seed, copy_x=True,
                       n_jobs=n_jobs, algorithm=algorithm)
    k_means.fit(inputarr)
    # Hyper-parameters echoed back into the markdown report.
    params = {'input_cols':input_cols, 'n_clusters':n_clusters, 'init':init, 'n_init':n_init, 'max_iter':max_iter,
              'tol':tol, 'precompute_distances':precompute_distances, 'seed':seed, 'n_jobs':n_jobs,
              'algorithm':algorithm}
    cluster_centers = k_means.cluster_centers_
    labels = k_means.labels_
    # 2-component PCA is used only to project the data for the 2-D cluster plot.
    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)
    # Figure helpers are defined elsewhere in this module; they return
    # markdown/image snippets embedded in the report below.
    fig_centers = _kmeans_centers_plot(input_cols, cluster_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, cluster_centers)
    fig_pca = _kmeans_pca_plot(labels, cluster_centers, pca2_model, pca2)
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Result
    | - Number of iterations run: {n_iter_}. 
    | - Coordinates of cluster centers
    | {fig_cluster_centers}
    | - Samples
    | {fig_pca}
    | {fig_samples}
    |
    | ### Parameters
    | {params}
    """.format(n_iter_=k_means.n_iter_, fig_cluster_centers=fig_centers, fig_pca=fig_pca, fig_samples=fig_samples,
               params=dict2MD(params))))
    model = _model_dict('kmeans')
    model['model'] = k_means
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()
    # Labels are appended to a copy so the caller's table is left untouched.
    out_table = table.copy()
    out_table[prediction_col] = labels
    return {'out_table':out_table, 'model':model}
def _svm_classification_train(table, feature_cols, label_col, c=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0,
                              shrinking=True, probability=True, tol=1e-3, max_iter=-1, random_state=None):
    """Train an SVM classifier (``sklearn.svm.SVC``) on the given feature/label columns.

    Rejects a continuous label column, since classification requires discrete
    targets. Returns ``{'model': ...}`` where the model dict carries the fitted
    estimator, the feature column names, and a markdown parameter report.
    """
    validate(greater_than(c, 0.0, 'c'))
    frame = table.copy()
    features = frame[feature_cols]
    labels = frame[label_col]
    # A continuous target means the user picked a regression-style column.
    if sklearn_utils.multiclass.type_of_target(labels) == 'continuous':
        raise_runtime_error('''Label Column should not be continuous.''')
    classifier = svm.SVC(C=c, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking,
                         probability=probability, tol=tol, max_iter=max_iter, random_state=random_state)
    fitted = classifier.fit(features, labels)
    # Echo the estimator's parameters plus the column selections in the report.
    params = classifier.get_params()
    params['feature_cols'] = feature_cols
    params['label_col'] = label_col
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter}
    """.format(table_parameter=dict2MD(params))))
    model = _model_dict('svc_model')
    model['svc_model'] = fitted
    model['features'] = feature_cols
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def split_data(table, train_ratio=7.0, test_ratio=3.0, random_state=None, shuffle=True, stratify=None):
    """Split ``table`` into train/test partitions by the given ratio.

    ``train_ratio``/``test_ratio`` are relative weights (e.g. 7:3); only their
    proportion matters. Returns ``{'train_table': ..., 'test_table': ...}``
    with each partition's index reset (old index kept as a column, matching
    ``reset_index()`` default behavior).
    """
    validate(greater_than(train_ratio, 0.0, 'train_ratio'),
             greater_than(test_ratio, 0.0, 'test_ratio'))
    # Convert the weight pair into the test-set fraction expected by sklearn.
    test_fraction = test_ratio / (train_ratio + test_ratio)
    train_part, test_part = sktrain_test_split(table, test_size=test_fraction, random_state=random_state,
                                               shuffle=shuffle, stratify=stratify)
    return {
        'train_table': train_part.reset_index(),
        'test_table': test_part.reset_index()
    }
def _pairplot(table, x_vars, y_vars=None, kind='scatter', diag_kind='auto', markers=None, palette=None, height=2.5,
              aspect=1, dropna=True, hue=None):
    """Render a seaborn pairplot of the given columns and return it as a Brightics repr.

    Marker size is scaled with ``height`` so small grids stay readable; for
    non-scatter kinds the size goes through ``scatter_kws`` (the key regplot
    understands). Tick labels are rotated on small plots to avoid overlap.
    """
    validate(greater_than(height, 0, 'height'), greater_than(aspect, 0, 'aspect'))
    base_marker_area = plt.rcParams['lines.markersize'] ** 2.
    size_kws = {"s": base_marker_area * height / 6.4}
    if y_vars is None:
        y_vars = x_vars
    if kind == 'scatter':
        grid = sns.pairplot(table, x_vars=x_vars, y_vars=y_vars, kind=kind, diag_kind=diag_kind, markers=markers,
                            height=height, aspect=aspect, dropna=dropna, hue=hue, palette=palette,
                            plot_kws=size_kws)
    else:
        # Non-scatter kinds (e.g. 'reg') take marker options nested under 'scatter_kws'.
        grid = sns.pairplot(table, x_vars=x_vars, y_vars=y_vars, kind=kind, diag_kind=diag_kind, markers=markers,
                            height=height, aspect=aspect, dropna=dropna, hue=hue, palette=palette,
                            plot_kws={'scatter_kws': size_kws})
    # Rotate x tick labels progressively as the subplot height shrinks.
    if height <= 2.5:
        for axis in grid.axes.flatten():
            for tick_label in axis.get_xticklabels():
                tick_label.set_rotation(90 * (2.5 - height))
    rb = BrtcReprBuilder()
    rb.addPlt(plt)
    plt.clf()
    return {'result': {'_repr_brtc_': rb.get()}}
def _profile_table(table, bins=10, check_correlation=False, correlation_threshold=0.9, correlation_overrides=None):
    """Build a pandas-profiling report for ``table`` and wrap its HTML as a Brightics repr."""
    validate(greater_than_or_equal_to(bins, 1, 'bins'),
             greater_than(correlation_threshold, 0.0, 'correlation_threshold'))
    report = pd_profiling.ProfileReport(table, bins=bins, check_correlation=check_correlation,
                                        correlation_threshold=correlation_threshold,
                                        correlation_overrides=correlation_overrides)
    rb = BrtcReprBuilder()
    rb.addHTML(report.html)
    summary = {'_repr_brtc_': rb.get()}
    return {'result': summary}
def _kmeans_silhouette_train_predict(table, input_cols, n_clusters_list=range(2, 10), prediction_col='prediction',
                                     init='k-means++', n_init=10, max_iter=300, tol=1e-4,
                                     precompute_distances='auto', seed=None, n_jobs=1, algorithm='auto',
                                     n_samples=None):
    """Fit KMeans for every k in ``n_clusters_list``, pick the best k by silhouette score,
    and label every row with the winning model.

    For each candidate k a silhouette plot (left) and a 2-D PCA scatter of the
    clusters (right) is rendered into the report. Returns a dict with
    'out_table' (table plus ``prediction_col``) and 'model' (best k, centers,
    fitted estimator, and the markdown report).
    """
    # Default the sample-plot size to the whole table when not specified.
    if n_samples is None:
        n_samples = len(table)
    inputarr = table[input_cols]
    # Silhouette needs at least 2 clusters, hence the > 1 check on every candidate k.
    validate(all_elements_greater_than(n_clusters_list, 1, 'n_clusters_list'),
             greater_than_or_equal_to(n_init, 1, 'n_init'),
             greater_than_or_equal_to(max_iter, 1, 'max_iter'),
             greater_than(tol, 0.0, 'tol'),
             greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
             greater_than_or_equal_to(n_samples, 0, 'n_samples'))
    # Single 2-component PCA shared by all per-k scatter plots.
    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)
    silhouette_list = []
    # NOTE(review): 'silouette' is a pre-existing misspelling; kept to avoid code changes.
    silouette_samples_list = []
    models = []
    centers_list = []
    images = []
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k, init=init, n_init=n_init, max_iter=max_iter, tol=tol,
                           precompute_distances=precompute_distances, verbose=0, random_state=seed, copy_x=True,
                           n_jobs=n_jobs, algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_
        centers_list.append(centersk)
        # Mean silhouette over all samples decides the winning k below.
        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        # Per-sample silhouette values drive the fill plot for this k.
        samples = silhouette_samples(inputarr, predict)
        silouette_samples_list.append(samples)
        pca2_centers = pca2_model.transform(centersk)
        # ax1: silhouette profile per cluster; ax2: PCA scatter colored by cluster.
        _, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0
        for i, color in zip(range(k), colors):
            # Sorted silhouette values of cluster i, stacked vertically from y_lower.
            si = samples[predict == i]
            si.sort()
            sizei = si.shape[0]
            y_upper = y_lower + sizei
            ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, si, facecolor=color, edgecolor=color, alpha=0.7)
            y_lower = y_upper
            ax2.scatter(pca2[:, 0][predict == i], pca2[:, 1][predict == i], color=color)
        # Red line marks the mean silhouette score for this k.
        ax1.axvline(x=score, color="red")
        ax2.scatter(pca2_centers[:, 0], pca2_centers[:, 1], marker='x', edgecolors=1, s=200, color=colors)
        imagek = plt2MD(plt)
        plt.clf()
        images.append(imagek)
    # Winning k maximizes the mean silhouette score.
    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_
    # NOTE(review): best_labels is assigned but never used afterwards.
    best_labels = best_model.labels_
    fig_centers = _kmeans_centers_plot(input_cols, best_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, best_centers)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2)
    # Summary curve: silhouette score per candidate k.
    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    fig_silhouette = plt2MD(plt)
    plt.clf()
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Silhouette Result
    | - silloutte metrics:
    | {fig_silhouette}
    | - best K: {best_k}
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette, best_k=best_k, fig_pca=fig_pca, fig_centers=fig_centers,
               fig_samples=fig_samples)))
    # Append the per-k diagnostic figures to the report.
    for k, image in zip(n_clusters_list, images):
        rb.addMD(strip_margin("""
        | ### k = {k}
        | {image}
        |
        """.format(k=k, image=image)))
    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()
    # Labels are appended to a copy so the caller's table is left untouched.
    out_table = table.copy()
    out_table[prediction_col] = predict
    return {'out_table':out_table, 'model':model}
def _correlation(table, vars, method='pearson', height=2.5, corr_prec=2):
    """Compute pairwise correlations between the given columns and render a pair grid.

    ``method`` is one of 'pearson', 'spearman', or 'kendal' (pre-existing
    spelling of the Kendall tau key — kept for caller compatibility).
    ``corr_prec`` sets the displayed precision of the correlation coefficients.

    Returns ``{'result': ...}`` with the parameter echo, the correlation
    matrix, the p-value matrix, and the rendered plot (``_repr_brtc_``).
    """
    validate(greater_than(height, 0, 'height'),
             greater_than_or_equal_to(corr_prec, 1, 'corr_prec'))
    size = len(vars)
    s_default = plt.rcParams['lines.markersize'] ** 2.
    scatter_kws = {"s": s_default * height / 6.4}
    # Symmetric matrices: coefficients default to 1 (diagonal), p-values to 0.
    corr_arr = np.ones((size, size))
    p_arr = np.zeros((size, size))
    # Fill the strict lower triangle pairwise.
    for i in range(size):
        for j in range(i):
            if method == 'pearson':
                r, p = stats.pearsonr(table[vars[i]], table[vars[j]])
            elif method == 'spearman':
                r, p = stats.spearmanr(table[vars[i]], table[vars[j]])
            elif method == 'kendal':
                r, p = stats.kendalltau(table[vars[i]], table[vars[j]])
            corr_arr[i][j] = r
            p_arr[i][j] = p
    # Mirror the lower triangle into the upper triangle.
    for i in range(size):
        for j in range(i, size):
            corr_arr[i][j] = corr_arr[j][i]
            p_arr[i][j] = p_arr[j][i]

    def corr(x, y, **kwargs):
        """Pair-grid callback: annotate a cell with r (sized by |r|) and significance stars."""
        if kwargs['method'] == 'pearson':
            r, p = stats.pearsonr(x, y)
        elif kwargs['method'] == 'spearman':
            r, p = stats.spearmanr(x, y)
        elif kwargs['method'] == 'kendal':
            r, p = stats.kendalltau(x, y)
        # Conventional significance stars at the 0.05 / 0.01 / 0.001 levels.
        p_stars = ''
        if p <= 0.05:
            p_stars = '*'
            if p <= 0.01:
                p_stars = '**'
                if p <= 0.001:
                    p_stars = '***'
        corr_text = '{:.{prec}f}'.format(r, prec=corr_prec)
        # Fixed: removed a leftover debug `print(type(corr_prec))` that polluted
        # stdout once per upper-triangle cell.
        font_size = abs(r) * 15 * 2 / corr_prec + 5
        ax = plt.gca()
        ax.annotate(corr_text, [.5, .5, ], xycoords="axes fraction",
                    ha='center', va='center', fontsize=font_size * height)
        ax.annotate(p_stars, xy=(0.65, 0.6), xycoords=ax.transAxes,
                    color='red', fontsize=17 * height)

    g = sns.PairGrid(table, vars=vars, height=height)
    g.map_diag(sns.distplot)
    if method == 'pearson':
        g.map_lower(sns.regplot, scatter_kws=scatter_kws)
    else:
        # Rank-based methods get a lowess fit instead of a linear one.
        g.map_lower(sns.regplot, lowess=True, scatter_kws=scatter_kws)
    g.map_upper(corr, method=method)
    rb = BrtcReprBuilder()
    rb.addPlt(plt)
    plt.clf()
    params = {'vars': vars, 'method': method, 'height': height}
    res = dict()
    res['params'] = params
    res['corr'] = corr_arr
    res['pvalue'] = p_arr
    res['_repr_brtc_'] = rb.get()
    return {'result': res}