Ejemplo n.º 1
0
def tfidf(table, group_by=None, **params):  # This will be deprecated.
    """Check the parameters for ``_tfidf`` and run it, optionally per group.

    ``check_required_parameters`` is called for ``_tfidf`` (with ``['table']``
    as its third argument), ``params`` is replaced by the result of
    ``get_default_from_parameters_if_required``, and the validator helpers
    are applied to ``min_df`` (bound 0), ``num_voca`` (bound 2) and
    ``max_df`` (bound 0).

    Args:
        table: Input handed to ``_tfidf`` or ``_function_by_group``.
        group_by: When not ``None``, the call goes through
            ``_function_by_group``.
        **params: Keyword parameters forwarded to ``_tfidf``.

    Returns:
        The return value of ``_tfidf`` or of ``_function_by_group``.
    """
    check_required_parameters(_tfidf, params, ['table'])
    params = get_default_from_parameters_if_required(params, _tfidf)
    validate(
        greater_than_or_equal_to(params, 0, 'min_df'),
        greater_than_or_equal_to(params, 2, 'num_voca'),
        greater_than(params, 0, 'max_df'),
    )

    if group_by is None:
        return _tfidf(table, **params)
    return _function_by_group(_tfidf, table, group_by=group_by, **params)
Ejemplo n.º 2
0
def mean_shift(table, group_by=None, **params):
    """Check the parameters for ``_mean_shift`` and run it, optionally per group.

    After the required-parameter check and default filling, ``bandwidth`` is
    validated with ``greater_than`` against ``0.0``.

    Args:
        table: Input handed to ``_mean_shift`` or ``_function_by_group``.
        group_by: When not ``None``, the call goes through
            ``_function_by_group``.
        **params: Keyword parameters forwarded to ``_mean_shift``.

    Returns:
        The return value of ``_mean_shift`` or of ``_function_by_group``.
    """
    check_required_parameters(_mean_shift, params, ['table'])
    params = get_default_from_parameters_if_required(params, _mean_shift)
    validate(greater_than(params, 0.0, 'bandwidth'))

    if group_by is None:
        return _mean_shift(table, **params)
    return _function_by_group(_mean_shift, table, group_by=group_by, **params)
Ejemplo n.º 3
0
def _split_data(table,
                train_ratio=7.0,
                test_ratio=3.0,
                random_state=None,
                shuffle=True,
                stratify=None):
    """Split ``table`` into train and test parts by relative ratios.

    Both ratios are validated with ``greater_than`` against ``0.0``. The test
    fraction passed to ``sktrain_test_split`` is
    ``test_ratio / (train_ratio + test_ratio)``.

    Args:
        table: Data to split; the resulting parts must support
            ``reset_index(drop=True)``.
        train_ratio: Relative weight of the training part.
        test_ratio: Relative weight of the test part.
        random_state: Forwarded to ``sktrain_test_split``.
        shuffle: Forwarded to ``sktrain_test_split``.
        stratify: Forwarded to ``sktrain_test_split``.

    Returns:
        dict with ``'train_table'`` and ``'test_table'``, each with its index
        reset.
    """
    validate(greater_than(train_ratio, 0.0, 'train_ratio'),
             greater_than(test_ratio, 0.0, 'test_ratio'))

    test_fraction = test_ratio / (train_ratio + test_ratio)
    split_options = dict(test_size=test_fraction,
                         random_state=random_state,
                         shuffle=shuffle,
                         stratify=stratify)
    train_part, test_part = sktrain_test_split(table, **split_options)

    result = {}
    result['train_table'] = train_part.reset_index(drop=True)
    result['test_table'] = test_part.reset_index(drop=True)
    return result
Ejemplo n.º 4
0
def profile_table(table, group_by=None, **params):
    """Check the parameters for ``_profile_table`` and run it, optionally per group.

    After the required-parameter check and default filling, ``bins`` is
    validated with ``greater_than_or_equal_to`` (bound 1) and
    ``correlation_threshold`` with ``greater_than`` (bound 0.0).

    Args:
        table: Input handed to ``_profile_table`` or ``_function_by_group``.
        group_by: When not ``None``, the call goes through
            ``_function_by_group``.
        **params: Keyword parameters forwarded to ``_profile_table``.

    Returns:
        The return value of ``_profile_table`` or of ``_function_by_group``.
    """
    check_required_parameters(_profile_table, params, ['table'])
    params = get_default_from_parameters_if_required(params, _profile_table)
    validate(
        greater_than_or_equal_to(params, 1, 'bins'),
        greater_than(params, 0.0, 'correlation_threshold'),
    )

    if group_by is None:
        return _profile_table(table, **params)
    return _function_by_group(_profile_table, table, group_by=group_by, **params)
Ejemplo n.º 5
0
def _pairplot(table,
              x_vars,
              y_vars=None,
              kind='scatter',
              diag_kind='auto',
              markers=None,
              palette=None,
              height=2.5,
              aspect=1,
              dropna=True,
              hue=None):
    """Draw a seaborn pair plot for ``table`` and return its rendered form.

    ``height`` and ``aspect`` are validated with ``greater_than`` against 0.
    The marker size is derived from ``plt.rcParams['lines.markersize']`` and
    scaled by ``height``. For ``kind == 'scatter'`` it is passed directly as
    ``plot_kws``; for any other kind it is nested under ``'scatter_kws'``.

    Args:
        table: Data passed to ``sns.pairplot``.
        x_vars: Forwarded as ``x_vars``; also used for ``y_vars`` when that
            is ``None``.
        y_vars: Forwarded as ``y_vars``.
        kind, diag_kind, markers, palette, height, aspect, dropna, hue:
            Forwarded to ``sns.pairplot`` under the same names.

    Returns:
        ``{'result': {'_repr_brtc_': ...}}`` built from a ``BrtcReprBuilder``
        that was given the current pyplot state.
    """
    validate(greater_than(height, 0, 'height'),
             greater_than(aspect, 0, 'aspect'))

    marker_kws = {"s": plt.rcParams['lines.markersize']**2. * height / 6.4}
    # Only the shape of plot_kws differs between the scatter and non-scatter kinds.
    plot_kws = marker_kws if kind == 'scatter' else {'scatter_kws': marker_kws}

    row_vars = x_vars if y_vars is None else y_vars

    grid = sns.pairplot(table,
                        x_vars=x_vars,
                        y_vars=row_vars,
                        kind=kind,
                        diag_kind=diag_kind,
                        markers=markers,
                        height=height,
                        aspect=aspect,
                        dropna=dropna,
                        hue=hue,
                        palette=palette,
                        plot_kws=plot_kws)

    if height <= 2.5:
        # Smaller facets get more strongly rotated x tick labels.
        angle = 90 * (2.5 - height)
        for axis in grid.axes.flatten():
            for tick_label in axis.get_xticklabels():
                tick_label.set_rotation(angle)

    builder = BrtcReprBuilder()
    builder.addPlt(plt)
    plt.clf()

    return {'result': {'_repr_brtc_': builder.get()}}
Ejemplo n.º 6
0
def bow(table, group_by=None, **params):
    """Check the parameters for ``_bow`` and run it, optionally per group.

    After the required-parameter check and default filling, the validator
    helpers are applied to ``no_below`` (``greater_than_or_equal_to``, 0),
    ``no_above`` (``less_than_or_equal_to`` 1.0 and ``greater_than`` 0.0) and
    ``keep_n`` (``greater_than_or_equal_to``, 1).

    Args:
        table: Input handed to ``_bow`` or ``_function_by_group``.
        group_by: When not ``None``, the call goes through
            ``_function_by_group``.
        **params: Keyword parameters forwarded to ``_bow``.

    Returns:
        The return value of ``_bow`` or of ``_function_by_group``.
    """
    check_required_parameters(_bow, params, ['table'])
    params = get_default_from_parameters_if_required(params, _bow)
    validate(
        greater_than_or_equal_to(params, 0, 'no_below'),
        less_than_or_equal_to(params, 1.0, 'no_above'),
        greater_than(params, 0.0, 'no_above'),
        greater_than_or_equal_to(params, 1, 'keep_n'),
    )

    if group_by is None:
        return _bow(table, **params)
    return _function_by_group(_bow, table, group_by=group_by, **params)
Ejemplo n.º 7
0
def naive_bayes_train(table, group_by=None, **params):
    """Check the parameters for ``_naive_bayes_train`` and run it, optionally per group.

    The required-parameter check runs first, then defaults are filled in,
    then ``alpha`` is validated with ``greater_than`` against 0. This is the
    order used by the other group-aware wrappers, so a missing required
    parameter is reported before any value validation.

    Args:
        table: Input handed to ``_naive_bayes_train`` or
            ``_function_by_group``.
        group_by: When not ``None``, the call goes through
            ``_function_by_group``.
        **params: Keyword parameters forwarded to ``_naive_bayes_train``.

    Returns:
        The return value of ``_naive_bayes_train`` or of
        ``_function_by_group``.
    """
    # Check on the caller-supplied params, before defaults are merged in.
    check_required_parameters(_naive_bayes_train, params, ['table'])
    params = get_default_from_parameters_if_required(params,
                                                     _naive_bayes_train)
    validate(greater_than(params, 0, 'alpha'))

    if group_by is None:
        return _naive_bayes_train(table, **params)
    return _function_by_group(_naive_bayes_train,
                              table,
                              group_by=group_by,
                              **params)
Ejemplo n.º 8
0
def lda(table, group_by=None, **params):
    """Check the parameters for ``_lda`` and run it, optionally per group.

    After the required-parameter check and default filling, the validator
    helpers are applied to ``num_voca`` (bound 2), ``num_topic`` (bound 2),
    ``num_topic_word`` (``from_to`` with 2 and ``params['num_voca']``),
    ``max_iter`` (bound 1) and ``learning_offset`` (``greater_than`` 1.0).

    Args:
        table: Input handed to ``_lda`` or ``_function_by_group``.
        group_by: When not ``None``, the call goes through
            ``_function_by_group``.
        **params: Keyword parameters forwarded to ``_lda``.

    Returns:
        The return value of ``_lda`` or of ``_function_by_group``.
    """
    check_required_parameters(_lda, params, ['table'])
    params = get_default_from_parameters_if_required(params, _lda)
    validate(
        greater_than_or_equal_to(params, 2, 'num_voca'),
        greater_than_or_equal_to(params, 2, 'num_topic'),
        # The upper bound is read straight from params, so 'num_voca' must be present here.
        from_to(params, 2, params['num_voca'], 'num_topic_word'),
        greater_than_or_equal_to(params, 1, 'max_iter'),
        greater_than(params, 1.0, 'learning_offset'),
    )

    if group_by is None:
        return _lda(table, **params)
    return _function_by_group(_lda, table, group_by=group_by, **params)
Ejemplo n.º 9
0
def correlation(table, group_by=None, **params):
    """Check the parameters for ``_correlation`` and run it, optionally per group.

    After the required-parameter check and default filling, ``height`` is
    validated with ``greater_than`` (bound 0) and ``corr_prec`` with
    ``greater_than_or_equal_to`` (bound 1).

    Args:
        table: Input handed to ``_correlation`` or ``_function_by_group``.
        group_by: When not ``None``, the call goes through
            ``_function_by_group``.
        **params: Keyword parameters forwarded to ``_correlation``.

    Returns:
        The return value of ``_correlation`` or of ``_function_by_group``.
    """
    check_required_parameters(_correlation, params, ['table'])
    params = get_default_from_parameters_if_required(params, _correlation)
    validate(
        greater_than(params, 0, 'height'),
        greater_than_or_equal_to(params, 1, 'corr_prec'),
    )

    if group_by is None:
        return _correlation(table, **params)
    return _function_by_group(_correlation, table, group_by=group_by, **params)
Ejemplo n.º 10
0
def outlier_detection_tukey_carling(table, group_by=None, **params):
    """Check the parameters for ``_outlier_detection_tukey_carling`` and run it.

    After the required-parameter check and default filling, ``multiplier`` is
    validated with ``greater_than`` against ``0.0``.

    Args:
        table: Input handed to ``_outlier_detection_tukey_carling`` or
            ``_function_by_group``.
        group_by: When not ``None``, the call goes through
            ``_function_by_group``.
        **params: Keyword parameters forwarded to the implementation.

    Returns:
        The return value of ``_outlier_detection_tukey_carling`` or of
        ``_function_by_group``.
    """
    impl = _outlier_detection_tukey_carling
    check_required_parameters(impl, params, ['table'])
    params = get_default_from_parameters_if_required(params, impl)
    validate(greater_than(params, 0.0, 'multiplier'))

    if group_by is None:
        return impl(table, **params)
    return _function_by_group(impl, table, group_by=group_by, **params)
Ejemplo n.º 11
0
def kmeans_silhouette_train_predict(table, group_by=None, **params):
    """Check the parameters for ``_kmeans_silhouette_train_predict`` and run it.

    After the required-parameter check and default filling, the validator
    helpers are applied to ``n_clusters_list`` (``all_elements_greater_than``
    1), ``n_init`` (bound 1), ``max_iter`` (bound 1), ``tol``
    (``greater_than`` 0.0), ``n_jobs`` (bound 1) and ``n_samples`` (bound 0).

    Args:
        table: Input handed to the implementation or ``_function_by_group``.
        group_by: When not ``None``, the call goes through
            ``_function_by_group``.
        **params: Keyword parameters forwarded to the implementation.

    Returns:
        The return value of ``_kmeans_silhouette_train_predict`` or of
        ``_function_by_group``.
    """
    impl = _kmeans_silhouette_train_predict
    check_required_parameters(impl, params, ['table'])
    params = get_default_from_parameters_if_required(params, impl)
    validate(
        all_elements_greater_than(params, 1, 'n_clusters_list'),
        greater_than_or_equal_to(params, 1, 'n_init'),
        greater_than_or_equal_to(params, 1, 'max_iter'),
        greater_than(params, 0.0, 'tol'),
        greater_than_or_equal_to(params, 1, 'n_jobs'),
        greater_than_or_equal_to(params, 0, 'n_samples'),
    )

    if group_by is None:
        return impl(table, **params)
    return _function_by_group(impl, table, group_by=group_by, **params)
Ejemplo n.º 12
0
def decision_tree_regression_train(table, group_by=None, **params):
    """Check the parameters for ``_decision_tree_regression_train`` and run it.

    After the required-parameter check and default filling, the validator
    helpers are applied to ``min_samples_split`` (bound 2),
    ``min_samples_leaf`` (bound 1), ``min_weight_fraction_leaf`` (bound 0.0),
    ``max_depth`` (bound 1), ``max_features`` (bound 1), ``max_leaf_nodes``
    (``greater_than`` 1) and ``min_impurity_split`` (bound 0.0).

    Args:
        table: Input handed to the implementation or ``_function_by_group``.
        group_by: When not ``None``, the call goes through
            ``_function_by_group``.
        **params: Keyword parameters forwarded to the implementation.

    Returns:
        The return value of ``_decision_tree_regression_train`` or of
        ``_function_by_group``.
    """
    impl = _decision_tree_regression_train
    check_required_parameters(impl, params, ['table'])
    params = get_default_from_parameters_if_required(params, impl)
    validate(
        greater_than_or_equal_to(params, 2, 'min_samples_split'),
        greater_than_or_equal_to(params, 1, 'min_samples_leaf'),
        greater_than_or_equal_to(params, 0.0, 'min_weight_fraction_leaf'),
        greater_than_or_equal_to(params, 1, 'max_depth'),
        greater_than_or_equal_to(params, 1, 'max_features'),
        greater_than(params, 1, 'max_leaf_nodes'),
        greater_than_or_equal_to(params, 0.0, 'min_impurity_split'),
    )

    if group_by is None:
        return impl(table, **params)
    return _function_by_group(impl, table, group_by=group_by, **params)
Ejemplo n.º 13
0
def ada_boost_regression_train(table, group_by=None, **params):
    """Check the parameters for ``_ada_boost_regression_train`` and run it.

    After the required-parameter check and default filling, the validator
    helpers are applied to ``max_depth`` (bound 2), ``n_estimators``
    (bound 1) and ``learning_rate`` (``greater_than`` 0).

    Args:
        table: Input handed to the implementation or ``_function_by_group``.
        group_by: When not ``None``, the call goes through
            ``_function_by_group``.
        **params: Keyword parameters forwarded to the implementation.

    Returns:
        The return value of ``_ada_boost_regression_train`` or of
        ``_function_by_group``.
    """
    impl = _ada_boost_regression_train
    check_required_parameters(impl, params, ['table'])
    params = get_default_from_parameters_if_required(params, impl)
    validate(
        greater_than_or_equal_to(params, 2, 'max_depth'),
        greater_than_or_equal_to(params, 1, 'n_estimators'),
        greater_than(params, 0, 'learning_rate'),
    )

    if group_by is None:
        return impl(table, **params)
    return _function_by_group(impl, table, group_by=group_by, **params)
Ejemplo n.º 14
0
def svm_classification_train(table, group_by=None, **params):
    """Check the parameters for ``_svm_classification_train`` and run it.

    After the required-parameter check and default filling, the validator
    helpers are applied to ``c`` (``over_to`` with 0.0 and 1.0), ``degree``
    (``greater_than_or_equal_to`` 0) and ``tol`` (``greater_than`` 0.0).

    Args:
        table: Input handed to the implementation or ``_function_by_group``.
        group_by: When not ``None``, the call goes through
            ``_function_by_group``.
        **params: Keyword parameters forwarded to the implementation.

    Returns:
        The return value of ``_svm_classification_train`` or of
        ``_function_by_group``.
    """
    impl = _svm_classification_train
    check_required_parameters(impl, params, ['table'])
    params = get_default_from_parameters_if_required(params, impl)
    validate(
        over_to(params, 0.0, 1.0, 'c'),
        greater_than_or_equal_to(params, 0, 'degree'),
        greater_than(params, 0.0, 'tol'),
    )

    if group_by is None:
        return impl(table, **params)
    return _function_by_group(impl, table, group_by=group_by, **params)
Ejemplo n.º 15
0
def penalized_linear_regression_train(table, group_by=None, **params):
    """Check the parameters for ``_penalized_linear_regression_train`` and run it.

    After the required-parameter check and default filling, the validator
    helpers are applied to ``alpha`` (``greater_than_or_equal_to`` 0.0),
    ``l1_ratio`` (``from_to`` with 0.0 and 1.0), ``max_iter``
    (``greater_than_or_equal_to`` 1) and ``tol`` (``greater_than`` 0.0).

    Args:
        table: Input handed to the implementation or ``_function_by_group``.
        group_by: When not ``None``, the call goes through
            ``_function_by_group``.
        **params: Keyword parameters forwarded to the implementation.

    Returns:
        The return value of ``_penalized_linear_regression_train`` or of
        ``_function_by_group``.
    """
    impl = _penalized_linear_regression_train
    check_required_parameters(impl, params, ['table'])
    params = get_default_from_parameters_if_required(params, impl)
    validate(
        greater_than_or_equal_to(params, 0.0, 'alpha'),
        from_to(params, 0.0, 1.0, 'l1_ratio'),
        greater_than_or_equal_to(params, 1, 'max_iter'),
        greater_than(params, 0.0, 'tol'),
    )

    if group_by is None:
        return impl(table, **params)
    return _function_by_group(impl, table, group_by=group_by, **params)
Ejemplo n.º 16
0
def _profile_table(table,
                   bins=10,
                   check_correlation=False,
                   correlation_threshold=0.9,
                   correlation_overrides=None):
    """Build a ``pd_profiling.ProfileReport`` for ``table`` and wrap its HTML.

    ``bins`` is validated with ``greater_than_or_equal_to`` (bound 1) and
    ``correlation_threshold`` with ``greater_than`` (bound 0.0).

    Args:
        table: Data passed to ``ProfileReport``.
        bins, check_correlation, correlation_threshold, correlation_overrides:
            Forwarded to ``ProfileReport`` under the same names.

    Returns:
        ``{'result': {'_repr_brtc_': ...}}`` where the inner value comes from
        a ``BrtcReprBuilder`` that was given the report's ``html``.
    """
    validate(greater_than_or_equal_to(bins, 1, 'bins'),
             greater_than(correlation_threshold, 0.0, 'correlation_threshold'))

    builder = BrtcReprBuilder()

    report_options = dict(bins=bins,
                          check_correlation=check_correlation,
                          correlation_threshold=correlation_threshold,
                          correlation_overrides=correlation_overrides)
    report = pd_profiling.ProfileReport(table, **report_options)
    builder.addHTML(report.html)

    return {'result': {'_repr_brtc_': builder.get()}}
Ejemplo n.º 17
0
def _kmeans_silhouette_train_predict(table,
                                     input_cols,
                                     n_clusters_list=range(2, 10),
                                     prediction_col='prediction',
                                     init='k-means++',
                                     n_init=10,
                                     max_iter=300,
                                     tol=1e-4,
                                     precompute_distances='auto',
                                     seed=None,
                                     n_jobs=1,
                                     algorithm='auto',
                                     n_samples=None):
    """Fit KMeans for several cluster counts and keep the best by silhouette score.

    One ``SKKMeans`` model is fitted on ``table[input_cols]`` for every value
    in ``n_clusters_list``. Each fit is scored with ``silhouette_score`` and
    the model with the highest score (``np.argmax``) is selected. A plot is
    rendered per ``k`` and collected into a markdown report, together with
    summary plots for the selected model.

    Args:
        table: Input data; ``table[input_cols]`` is used for fitting and
            ``table.copy()`` is the basis of the output table.
        input_cols: Column selector applied to ``table``.
        n_clusters_list: Candidate cluster counts; it is iterated and also
            indexed by position.
        prediction_col: Name of the column added to the output table.
        init, n_init, max_iter, tol, precompute_distances, n_jobs, algorithm:
            Forwarded to ``SKKMeans`` under the same names.
        seed: Forwarded to ``SKKMeans`` as ``random_state``.
        n_samples: Forwarded to ``_kmeans_samples_plot``; defaults to
            ``len(table)`` when ``None``.

    Returns:
        dict with ``'out_table'`` (copy of ``table`` with ``prediction_col``
        set to the selected model's predictions) and ``'model'`` (the result
        of ``_model_dict('kmeans_silhouette')`` with ``best_k``,
        ``best_centers``, ``best_model``, ``input_cols`` and ``_repr_brtc_``
        set).
    """
    if n_samples is None:
        n_samples = len(table)
    inputarr = table[input_cols]

    validate(all_elements_greater_than(n_clusters_list, 1, 'n_clusters_list'),
             greater_than_or_equal_to(n_init, 1, 'n_init'),
             greater_than_or_equal_to(max_iter, 1, 'max_iter'),
             greater_than(tol, 0.0, 'tol'),
             greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
             greater_than_or_equal_to(n_samples, 0, 'n_samples'))

    # 2-component PCA of the inputs, used for the scatter plots below and
    # passed on to _kmeans_pca_plot.
    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    silhouette_list = []
    # NOTE(review): filled in the loop but not read again in this function.
    silouette_samples_list = []
    models = []
    centers_list = []
    images = []
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k,
                           init=init,
                           n_init=n_init,
                           max_iter=max_iter,
                           tol=tol,
                           precompute_distances=precompute_distances,
                           verbose=0,
                           random_state=seed,
                           copy_x=True,
                           n_jobs=n_jobs,
                           algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_
        centers_list.append(centersk)

        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        samples = silhouette_samples(inputarr, predict)
        silouette_samples_list.append(samples)

        # Project this model's centers into the same 2-D PCA space.
        pca2_centers = pca2_model.transform(centersk)

        # Left axis: per-cluster silhouette values; right axis: PCA scatter.
        # NOTE(review): a new figure is created per k and only cleared
        # (plt.clf) afterwards, never closed - confirm this is intended.
        _, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0

        for i, color in zip(range(k), colors):
            # Sorted silhouette values of the samples assigned to cluster i.
            si = samples[predict == i]
            si.sort()

            sizei = si.shape[0]
            y_upper = y_lower + sizei

            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              si,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)

            # Stack the next cluster's band directly above this one.
            y_lower = y_upper

            ax2.scatter(pca2[:, 0][predict == i],
                        pca2[:, 1][predict == i],
                        color=color)

        # Vertical line at the overall silhouette score for this k.
        ax1.axvline(x=score, color="red")
        # NOTE(review): edgecolors=1 is not an obvious color value - verify.
        ax2.scatter(pca2_centers[:, 0],
                    pca2_centers[:, 1],
                    marker='x',
                    edgecolors=1,
                    s=200,
                    color=colors)

        imagek = plt2MD(plt)
        plt.clf()
        images.append(imagek)

    # Select the candidate with the highest silhouette score.
    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_
    # NOTE(review): best_labels is assigned but not used below.
    best_labels = best_model.labels_

    fig_centers = _kmeans_centers_plot(input_cols, best_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples,
                                       best_centers)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2)

    # Silhouette score per candidate k, plotted at evenly spaced positions
    # labelled with the k values.
    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    fig_silhouette = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Kmeans Silhouette Result
    | - silloutte metrics:
    | {fig_silhouette}
    | - best K: {best_k} 
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette,
               best_k=best_k,
               fig_pca=fig_pca,
               fig_centers=fig_centers,
               fig_samples=fig_samples)))

    # Append one section per candidate k with the plot rendered in the loop.
    for k, image in zip(n_clusters_list, images):
        rb.addMD(
            strip_margin("""
        | ### k = {k}
        | {image}
        |
        """.format(k=k, image=image)))

    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = predict

    return {'out_table': out_table, 'model': model}
Ejemplo n.º 18
0
def _correlation(table, vars, method='pearson', height=2.5, corr_prec=2):

    validate(greater_than(height, 0, 'height'),
             greater_than_or_equal_to(corr_prec, 1, 'corr_prec'))

    size = len(vars)

    s_default = plt.rcParams['lines.markersize']**2.
    scatter_kws = {"s": s_default * height / 6.4}

    result_arr = []

    for i in range(size):
        for j in range(i):
            if method == 'pearson':
                r, p = stats.pearsonr(table[vars[i]], table[vars[j]])
            elif method == 'spearman':
                r, p = stats.spearmanr(table[vars[i]], table[vars[j]])
            elif method == 'kendall':
                r, p = stats.kendalltau(table[vars[i]], table[vars[j]])

            result_arr.append([vars[i], vars[j], r, p])

    df_result = pd.DataFrame(result_arr, columns=['x', 'y', 'corr', 'p_value'])

    def corr(x, y, **kwargs):
        if kwargs['method'] == 'pearson':
            r, p = stats.pearsonr(x, y)
        elif kwargs['method'] == 'spearman':
            r, p = stats.spearmanr(x, y)
        elif kwargs['method'] == 'kendall':
            r, p = stats.kendalltau(x, y)

        p_stars = ''
        if p <= 0.05:
            p_stars = '*'
        if p <= 0.01:
            p_stars = '**'
        if p <= 0.001:
            p_stars = '***'

        corr_text = '{:.{prec}f}'.format(r, prec=corr_prec)
        font_size = abs(r) * 15 * 2 / corr_prec + 5
        ax = plt.gca()
        ax.annotate(corr_text, [
            .5,
            .5,
        ],
                    xycoords="axes fraction",
                    ha='center',
                    va='center',
                    fontsize=font_size * height)
        ax.annotate(p_stars,
                    xy=(0.65, 0.6),
                    xycoords=ax.transAxes,
                    color='red',
                    fontsize=17 * height)

    g = sns.PairGrid(table, vars=vars, height=height)
    g.map_diag(sns.distplot)
    if method == 'pearson':
        g.map_lower(sns.regplot, scatter_kws=scatter_kws)
    else:
        g.map_lower(sns.regplot, lowess=True, scatter_kws=scatter_kws)
    g.map_upper(corr, method=method)

    fig_corr = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin(""" ## Correlation Results
        | ### Correlation Matrix
        | {fig_corr}
        |
        | ### Correlation Table
        | {table}
        """.format(fig_corr=fig_corr, table=pandasDF2MD(df_result))))

    params = {'vars': vars, 'method': method, 'height': height}

    res = dict()
    res['params'] = params
    res['corr_table'] = df_result
    res['_repr_brtc_'] = rb.get()

    return {'result': res}
Ejemplo n.º 19
0
def _kmeans_train_predict(table,
                          input_cols,
                          n_clusters=3,
                          prediction_col='prediction',
                          init='k-means++',
                          n_init=10,
                          max_iter=300,
                          tol=1e-4,
                          precompute_distances='auto',
                          seed=None,
                          n_jobs=1,
                          algorithm='auto',
                          n_samples=None):
    """Fit KMeans on ``table[input_cols]`` and attach the cluster labels.

    The numeric arguments are validated, an ``SKKMeans`` model is fitted, a
    2-component PCA of the inputs is computed for plotting, and a markdown
    report with three plots and the parameter table is assembled.

    Args:
        table: Input data; ``table.copy()`` is the basis of the output table.
        input_cols: Column selector applied to ``table``.
        n_clusters, init, n_init, max_iter, tol, precompute_distances,
        n_jobs, algorithm: Forwarded to ``SKKMeans`` under the same names.
        prediction_col: Name of the column that receives the fitted labels.
        seed: Forwarded to ``SKKMeans`` as ``random_state``.
        n_samples: Forwarded to ``_kmeans_samples_plot``; defaults to the
            length of ``table[input_cols]`` when ``None``.

    Returns:
        dict with ``'out_table'`` (copy of ``table`` plus ``prediction_col``)
        and ``'model'`` (result of ``_model_dict('kmeans')`` with ``model``,
        ``input_cols`` and ``_repr_brtc_`` set).
    """
    features = table[input_cols]
    sample_count = len(features) if n_samples is None else n_samples

    validate(greater_than_or_equal_to(n_clusters, 1, 'n_clusters'),
             greater_than_or_equal_to(n_init, 1, 'n_init'),
             greater_than_or_equal_to(max_iter, 1, 'max_iter'),
             greater_than(tol, 0.0, 'tol'),
             greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
             greater_than_or_equal_to(sample_count, 0, 'n_samples'))

    estimator_options = dict(n_clusters=n_clusters,
                             init=init,
                             n_init=n_init,
                             max_iter=max_iter,
                             tol=tol,
                             precompute_distances=precompute_distances,
                             verbose=0,
                             random_state=seed,
                             copy_x=True,
                             n_jobs=n_jobs,
                             algorithm=algorithm)
    estimator = SKKMeans(**estimator_options)
    estimator.fit(features)

    centers = estimator.cluster_centers_
    assignments = estimator.labels_

    # The PCA fit stays after the KMeans fit, as in the original sequence.
    projector = PCA(n_components=2).fit(features)
    projected = projector.transform(features)

    # Plot helpers are called in their original order.
    centers_md = _kmeans_centers_plot(input_cols, centers)
    samples_md = _kmeans_samples_plot(table, input_cols, sample_count,
                                      centers)
    pca_md = _kmeans_pca_plot(assignments, centers, projector, projected)

    # Key order is kept because the dict is rendered by dict2MD.
    reported_params = {
        'input_cols': input_cols,
        'n_clusters': n_clusters,
        'init': init,
        'n_init': n_init,
        'max_iter': max_iter,
        'tol': tol,
        'precompute_distances': precompute_distances,
        'seed': seed,
        'n_jobs': n_jobs,
        'algorithm': algorithm
    }

    builder = BrtcReprBuilder()
    # Format first, then strip the margin, exactly as the original nested call did.
    report_md = """
    | ## Kmeans Result
    | - Number of iterations run: {n_iter_}.
    | - Coordinates of cluster centers
    | {fig_cluster_centers} 
    | - Samples
    | {fig_pca}
    | {fig_samples}
    |
    | ### Parameters
    | {params}
    """.format(n_iter_=estimator.n_iter_,
               fig_cluster_centers=centers_md,
               fig_pca=pca_md,
               fig_samples=samples_md,
               params=dict2MD(reported_params))
    builder.addMD(strip_margin(report_md))

    model = _model_dict('kmeans')
    model['model'] = estimator
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = builder.get()

    labelled = table.copy()
    labelled[prediction_col] = assignments
    return {'out_table': labelled, 'model': model}