Ejemplo n.º 1
0
def agglomerative_clustering_train_predict(input_table,
                                           input_cols,
                                           n_clusters=3,
                                           affinity='euclidean',
                                           compute_full_tree=True,
                                           linkage='ward',
                                           prediction_col='prediction',
                                           figw=6.4,
                                           figh=4.8):
    """Fit an agglomerative clustering model, label the table, and render a
    dendrogram of the merge hierarchy.

    NOTE(review): mutates `input_table` in place by adding `prediction_col`;
    the same object is returned as 'out_table'.
    """
    features = input_table[input_cols]

    model = SKAgglomerativeClustering(n_clusters=n_clusters,
                                      affinity=affinity,
                                      memory=None,
                                      connectivity=None,
                                      compute_full_tree=compute_full_tree,
                                      linkage=linkage)
    model.fit(features)
    input_table[prediction_col] = model.labels_

    # Build a synthetic linkage matrix in scipy's expected format
    # [child_a, child_b, distance, n_observations]; sklearn does not expose
    # merge distances, so the merge order is used as a stand-in distance.
    merges = model.children_
    n_merges = merges.shape[0]
    linkage_matrix = np.column_stack([
        merges,
        np.arange(n_merges),
        np.arange(2, n_merges + 2),
    ]).astype(float)

    plt.figure(figsize=(figw, figh))
    dendrogram(linkage_matrix)
    plot_dendrogram = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Agglomerative Clustering Result
    | {plot_dendrogram}
    """.format(plot_dendrogram=plot_dendrogram)))

    return {
        'out_table': input_table,
        'agglomerative_result': {
            'model': model,
            'input_cols': input_cols,
            '_repr_brtc_': rb.get()
        }
    }
Ejemplo n.º 2
0
def _ancova(table, response_cols, factor_col, between_col):
    """Run an analysis of covariance per response column.

    For each column in `response_cols`, draws a box plot grouped by
    `between_col` and computes an ANCOVA with `factor_col` as covariate,
    appending both to a markdown report.

    Returns {'result': result} where result carries the report under
    '_repr_brtc_' and a (currently empty) per-response '_grouped_data' dict.
    """
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    ## Analysis of Covariance Result
    """))
    groups = table[between_col].unique()
    groups.sort()
    # Combined length of the group labels drives x-tick rotation below.
    sum_len = np.sum([len(str(group)) for group in groups])

    result = dict()
    result['_grouped_data'] = dict()

    for response_col in response_cols:
        # (Removed unused local `data = table[response_col]` — the whole
        # table is passed to pg_ancova below.)
        result['_grouped_data'][response_col] = dict()

        ax = sns.boxplot(x=between_col,
                         y=response_col,
                         data=table,
                         order=groups)
        if sum_len > 512:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
        elif sum_len > 64:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

        fig_box = plt2MD(plt)
        plt.clf()

        ancova_res = pg_ancova(data=table,
                               dv=response_col,
                               covar=factor_col,
                               between=between_col)
        ancova_df = pandasDF2MD(ancova_res)

        rb.addMD(
            strip_margin("""
        | ## {response_col} by {between_col}
        | {fig_box}
        |
        | ### ANCOVA
        | {ancova_df}
        """.format(response_col=response_col,
                   between_col=between_col,
                   fig_box=fig_box,
                   ancova_df=ancova_df)))

    result['_repr_brtc_'] = rb.get()
    return {'result': result}
Ejemplo n.º 3
0
def _kmeans_centers_plot(input_cols, cluster_centers):
    """Plot each cluster center as a labelled line over the input features
    and return the figure as markdown."""
    positions = range(len(input_cols))
    total_label_len = np.sum([len(col) for col in input_cols])
    # Rotate the tick labels progressively as the combined text gets longer.
    if total_label_len >= 512:
        plt.xticks(positions, input_cols, rotation='vertical')
    elif total_label_len >= 64:
        plt.xticks(positions, input_cols, rotation=45, ha='right')
    else:
        plt.xticks(positions, input_cols)
    for cluster_idx, center in enumerate(cluster_centers):
        plt.plot(positions, center, "o-", label=cluster_idx)
    plt.legend()
    plt.tight_layout()
    fig_md = plt2MD(plt)
    plt.clf()
    return fig_md
Ejemplo n.º 4
0
def _plot_feature_importances(feature_cols, regressor):
    """Horizontal bar chart of the fitted regressor's feature importances,
    sorted ascending and annotated with values; returned as markdown."""
    importances = regressor.feature_importances_
    order = np.argsort(importances)
    sorted_names = np.array(feature_cols)[order]
    bar_positions = range(len(order))

    plt.barh(bar_positions, importances[order], color='b', align='center')
    # Print each bar's numeric importance next to it.
    for pos, value in enumerate(importances[order]):
        plt.text(value, pos, " {:.2f}".format(value), color='b',
                 va='center', fontweight='bold')

    plt.yticks(bar_positions, sorted_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    plt.tight_layout()
    fig_md = plt2MD(plt)
    plt.close()
    return fig_md
Ejemplo n.º 5
0
def _mean_shift_pca_plot(labels, cluster_centers, pca2_model, pca2, colors):
    """Scatter the 2-D PCA projection per cluster and mark the projected
    cluster centers with red crosses; return the figure as markdown."""
    for cluster_idx, color in zip(range(len(cluster_centers)), colors):
        mask = labels == cluster_idx
        plt.scatter(pca2[:, 0][mask], pca2[:, 1][mask], color=color)

    centers_2d = pca2_model.transform(cluster_centers)
    plt.scatter(centers_2d[:, 0], centers_2d[:, 1],
                marker='x', edgecolors=1, s=100, color='red')
    plt.tight_layout()
    fig_md = plt2MD(plt)
    plt.clf()
    return fig_md
Ejemplo n.º 6
0
def _kmeans_samples_plot(table, input_cols, n_samples, cluster_centers):
    """Overlay `n_samples` random rows (grey) with the cluster centers
    (thick labelled lines) across the input features; return markdown."""
    total_label_len = np.sum([len(col) for col in input_cols])
    sampled = table[input_cols].sample(n=n_samples)
    positions = range(len(input_cols))
    if total_label_len >= 512:
        plt.xticks(positions, input_cols, rotation='vertical')
    elif total_label_len >= 64:
        plt.xticks(positions, input_cols, rotation=45, ha='right')
    else:
        plt.xticks(positions, input_cols)
    # Transpose once so each sampled row becomes a column to plot.
    transposed = sampled.transpose()
    for row_idx in sampled.index:
        plt.plot(positions, transposed[row_idx], color='grey', linewidth=1)
    for cluster_idx, center in enumerate(cluster_centers):
        plt.plot(positions, center, "o-", label=cluster_idx, linewidth=4)
    plt.tight_layout()
    fig_md = plt2MD(plt)
    plt.clf()
    return fig_md
Ejemplo n.º 7
0
def _screeplot(explained_variance, explained_variance_ratio, n_components, ax=None):
    """Scree plot: per-component explained variance on the left axis and
    the cumulative explained-variance ratio on a twinned right axis,
    annotated at `n_components`; return the figure as markdown."""
    if ax is None:
        ax = plt.gca()

    component_range = range(1, len(explained_variance) + 1)
    cum_ratio = explained_variance_ratio.cumsum()
    plt.xticks(component_range, component_range)
    ax.plot(component_range, explained_variance, 'o--')
    ax.set_ylabel('Explained Variance')

    ratio_ax = ax.twinx()
    ratio_ax.plot(component_range, cum_ratio, 'x-')
    ratio_ax.set_ylim([0, 1.05])
    ratio_ax.set_ylabel('Cumulative Explained Variance Ratio')
    # Label the cumulative ratio at the chosen component count, nudged down
    # slightly so the text does not sit on the marker.
    ratio_ax.text(n_components, cum_ratio[n_components - 1] - 0.05,
                  '%0.4f' % cum_ratio[n_components - 1],
                  va='center', ha='center')
    fig_md = plt2MD(plt)
    plt.clf()
    return fig_md
Ejemplo n.º 8
0
def _kmeans_samples_plot(table, input_cols, n_samples, cluster_centers, seed, colors):
    """Overlay a seeded random sample of rows (grey) with per-cluster
    colored center lines across the input features; return markdown.

    Supports vector-typed columns via check_col_type, which expands them
    into flat feature names and a plain array.
    """
    feature_names, inputarr = check_col_type(table, input_cols)
    total_label_len = np.sum([len(col) for col in feature_names])
    sampled = pd.DataFrame(inputarr).sample(n=n_samples, random_state=seed)
    positions = range(len(feature_names))
    if total_label_len >= 512:
        plt.xticks(positions, feature_names, rotation='vertical')
    elif total_label_len >= 64:
        plt.xticks(positions, feature_names, rotation=45, ha='right')
    else:
        plt.xticks(positions, feature_names)
    transposed = sampled.transpose()
    for row_idx in sampled.index:
        plt.plot(positions, transposed[row_idx], color='grey', linewidth=1)
    for cluster_idx, center in enumerate(cluster_centers):
        plt.plot(positions, center, "o-", label=cluster_idx,
                 linewidth=2, color=colors[cluster_idx])
    plt.tight_layout()
    fig_md = plt2MD(plt)
    plt.clf()
    return fig_md
Ejemplo n.º 9
0
def _kmeans_pca_plot(labels, cluster_centers, pca2_model, pca2):
    """2-D PCA scatter of samples colored by cluster, with the projected
    cluster centers drawn as crosses; return the figure as markdown."""
    n_clusters = len(cluster_centers)
    # One evenly-spaced spectral color per cluster.
    colors = cm.nipy_spectral(np.arange(n_clusters).astype(float) / n_clusters)

    for cluster_idx, color in zip(range(n_clusters), colors):
        mask = labels == cluster_idx
        plt.scatter(pca2[:, 0][mask], pca2[:, 1][mask], color=color)

    centers_2d = pca2_model.transform(cluster_centers)
    plt.scatter(centers_2d[:, 0], centers_2d[:, 1],
                marker='x', edgecolors=1, s=200, color=colors)
    plt.tight_layout()
    fig_md = plt2MD(plt)
    plt.clf()
    return fig_md
Ejemplo n.º 10
0
def _spectral_clustering_samples_plot(labels, table, input_cols, n_samples,
                                      n_clusters, colors):
    """Line plot of (optionally sampled) rows across the input features,
    each row colored by its cluster label; return the figure as markdown.

    NOTE(review): `n_clusters` is accepted but unused here — kept for
    signature compatibility with callers.
    """
    total_label_len = np.sum([len(col) for col in input_cols])
    data = table[input_cols]
    sampled = data.sample(n=n_samples) if n_samples is not None else data
    positions = range(len(input_cols))
    if total_label_len >= 512:
        plt.xticks(positions, input_cols, rotation='vertical')
    elif total_label_len >= 64:
        plt.xticks(positions, input_cols, rotation=45, ha='right')
    else:
        plt.xticks(positions, input_cols)
    transposed = sampled.transpose()
    for row_idx in sampled.index:
        plt.plot(positions,
                 transposed[row_idx],
                 color=colors[labels[row_idx]],
                 linewidth=1)
    plt.tight_layout()
    fig_md = plt2MD(plt)
    plt.clf()
    return fig_md
Ejemplo n.º 11
0
def _kmeans_pca_plot(labels, cluster_centers, pca2_model, pca2, colors):
    """PCA scatter of samples and projected centers.

    With a single PCA component the one feature is drawn on both axes
    (a diagonal); otherwise the first two components are used. Returns
    the figure as markdown.
    """
    n_clusters = len(cluster_centers)
    pca2_centers = pca2_model.transform(cluster_centers)

    one_dim = pca2.shape[1] == 1
    # Which component feeds the y axis: reuse component 0 in the 1-D case.
    y_comp = 0 if one_dim else 1

    for cluster_idx, color in zip(range(n_clusters), colors):
        mask = labels == cluster_idx
        plt.scatter(pca2[:, 0][mask], pca2[:, y_comp][mask], color=color)
    plt.scatter(pca2_centers[:, 0], pca2_centers[:, y_comp],
                marker='x', edgecolors=1, s=200, color=colors)
    plt.xlabel("Feature space for the 1st feature")
    plt.ylabel("Feature space for the 1st feature" if one_dim
               else "Feature space for the 2nd feature")

    plt.tight_layout()
    fig_pca = plt2MD(plt)
    plt.clf()
    return fig_pca
Ejemplo n.º 12
0
def _timeseries_decomposition(table,
                              input_col,
                              frequency,
                              model_type='additive',
                              filteration=None,
                              two_sided=True,
                              extrapolate_trend=0):
    """Seasonal decomposition of one column into trend, seasonal and
    residual parts using statsmodels.

    Returns {'out_table': table copy with 'trend'/'seasonal'/'residual'
    columns appended, 'model': model dict carrying the markdown report}.
    """
    out_table = table.copy()
    decomposition = sm.tsa.seasonal_decompose(
        out_table[input_col],
        model=model_type,
        filt=filteration,
        freq=frequency,
        two_sided=two_sided,
        extrapolate_trend=extrapolate_trend)
    # statsmodels renders all four panels (observed/trend/seasonal/resid).
    decomposition.plot()
    decomposition_fig = plt2MD(plt)
    plt.clf()

    out_table['trend'] = decomposition.trend
    out_table['seasonal'] = decomposition.seasonal
    out_table['residual'] = decomposition.resid

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Time Series Decomposition Result
    | Model Type : {model_type}
    |
    | {image2}
    |
    """.format(model_type=model_type, image2=decomposition_fig)))

    model = _model_dict('timeseries_decomposition')
    model['model_type'] = model_type
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': model}
Ejemplo n.º 13
0
def _agglomerative_clustering_samples_plot(labels, table, input_cols,
                                           n_samples, n_clusters, colors):
    """Grey line plot of (optionally sampled) rows across the input
    features; vector-typed columns are expanded via check_col_type.
    Returns the figure as markdown.

    NOTE(review): `labels`, `n_clusters` and `colors` are accepted but
    unused here — kept for signature compatibility with callers.
    """
    data = table[input_cols]
    sampled = data.sample(n=n_samples) if n_samples is not None else data
    feature_names, sample_arr = check_col_type(sampled, input_cols)
    total_label_len = np.sum([len(col) for col in feature_names])
    positions = range(len(feature_names))
    if total_label_len >= 512:
        plt.xticks(positions, feature_names, rotation='vertical')
    elif total_label_len >= 64:
        plt.xticks(positions, feature_names, rotation=45, ha='right')
    else:
        plt.xticks(positions, feature_names)
    if feature_names == input_cols:
        # Plain columns: check_col_type handed the DataFrame back unchanged,
        # so iterate by its index.
        transposed = sample_arr.transpose()
        for row_idx in sample_arr.index:
            plt.plot(positions, transposed[row_idx], color='grey', linewidth=1)
    else:
        # Vector columns expanded into an array: iterate positionally.
        for row_idx in range(len(sample_arr)):
            plt.plot(positions, sample_arr[row_idx], color='grey', linewidth=1)
    plt.tight_layout()
    fig_md = plt2MD(plt)
    plt.clf()
    return fig_md
Ejemplo n.º 14
0
def _isotonic_regression_train(table, feature_col, label_col, increasing=True):
    """Fit an isotonic regression of `label_col` on `feature_col`.

    Parameters:
        table: DataFrame holding both columns.
        feature_col, label_col: column names; must differ.
        increasing: fit a non-decreasing (True) or non-increasing (False) curve.

    Returns {'model': model} where model carries the fitted regressor, its
    parameters, the column names and a markdown report with an
    actual-vs-predicted plot.

    Raises a BFE '0100' error when feature and label columns are the same.
    """
    if feature_col == label_col:
        raise BFE.from_errors([{
            '0100':
            # Fixed message typo: "deplicate" -> "duplicate".
            '{} is duplicate in Feature column and Label column'.format(
                feature_col)
        }])
    features = table[feature_col]
    label = table[label_col]
    isotonic_model = IsotonicRegression(increasing=increasing)
    isotonic_model.fit(features, label)
    predict = isotonic_model.predict(features)

    plt.figure()
    plt.plot(label, 'r.-')
    plt.plot(predict, 'b.-')
    plt.xlabel('Samples')
    plt.legend(['True label', 'Predicted'])
    fig_actual_predict = plt2MD(plt)
    # Clear the shared pyplot state, as every other trainer in this module
    # does; the original leaked this figure into the next plot.
    plt.clf()
    get_param = isotonic_model.get_params()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Isotonic Regression Result
    | ### Param
    | {param}
    | ### Predicted vs Actual
    | {image1}
    """.format(image1=fig_actual_predict, param=get_param)))
    # (Report title fixed: it previously said "Linear Regression Result",
    # copy-pasted from the linear-regression trainer.)
    model = _model_dict('isotonic_regression_model')
    model['_repr_brtc_'] = rb.get()
    model['feature_col'] = feature_col
    model['label_col'] = label_col
    model['parameters'] = get_param
    model['regressor'] = isotonic_model
    return {"model": model}
Ejemplo n.º 15
0
def _holt_winters_predict(model, prediction_num):
    """Forecast `prediction_num` steps ahead with each column's fitted
    Holt-Winters model ('hw_<col>' entries in `model`).

    Extends `model` in place with the markdown report and the prediction
    table, and returns {'model': model, 'out_table': predict_table}.
    """
    rb = BrtcReprBuilder()

    # One forecast series per configured input column.
    forecasts = pd.DataFrame()
    for column in model['input_columns']:
        forecasts[column] = model['hw_' + str(column)].forecast(prediction_num)

    numbering = pd.DataFrame()
    numbering['number'] = np.arange(1, prediction_num + 1, 1)
    predict_table = numbering.join(forecasts.reset_index(drop=True))

    rb.addMD(
        strip_margin("""
        |## Holt-Winters Predict Result
        |
        """.format()))

    # One train-vs-prediction chart per column.
    for column in model['input_columns']:
        plt.title(column)
        train_series = model['origin_table'][column]
        plt.plot(train_series.index, train_series, label='Train')
        plt.plot(forecasts[column].index, forecasts[column], label='Prediction')
        plt.legend(loc='best')
        rb.addMD(
            strip_margin("""
        |{plot}
        |
        """.format(plot=plt2MD(plt))))
        plt.clf()

    model['_repr_brtc_'] = rb.get()
    model['predict_table'] = predict_table

    return {'model': model, 'out_table': predict_table}
Ejemplo n.º 16
0
def _association_rule_visualization(table,
                                    option='multiple_to_single',
                                    edge_length_scaling=1,
                                    font_size=10,
                                    node_size_scaling=1,
                                    figure_size_muliplier=1,
                                    display_rule_num=False):
    """Draw association rules as a networkx digraph and return a report model.

    `option` selects the layout:
      - 'single_to_single': keep only rules with a 1-item antecedent and a
        1-item consequent; nodes are items, node color/size encode support,
        edge color encodes lift and edge width encodes confidence.
      - 'multiple_to_single': keep rules with a 1-item consequent; each rule
        gets its own intermediate "rule" node whose size encodes scaled lift
        and whose color encodes support.
      - anything else: keep all rules; rule nodes link every antecedent item
        to every consequent item.

    `display_rule_num` labels rule nodes 'R1', 'R2', ... instead of blanks.

    Returns {'model': model} with the markdown report under '_repr_brtc_'.

    NOTE(review): assumes `table` has 'antecedent'/'consequent' columns
    holding sequences plus numeric 'support', 'confidence' and 'lift'
    columns — verify against the producing operator.
    """

    if (option == 'single_to_single'):
        result_network = table.copy()

        # Collect per-rule antecedent/consequent lengths and first items.
        length_ante = []
        string_ante = []
        length_conse = []
        string_conse = []
        for row in result_network['antecedent']:
            length_ante += [len(row)]
            string_ante += [row[0]]
        for row in result_network['consequent']:
            length_conse += [len(row)]
            string_conse += [row[0]]
        result_network['length_ante'] = length_ante
        result_network['string_ante'] = string_ante
        result_network['length_conse'] = length_conse
        result_network['string_conse'] = string_conse
        # Keep only 1-to-1 rules.
        result_network = result_network[result_network.length_ante == 1]
        result_network = result_network[result_network.length_conse == 1]
        # Back out item supports from the rule metrics:
        # support(A) = support(rule)/confidence, support(B) = confidence/lift.
        result_network['support_ante'] = result_network[
            'support'] / result_network['confidence']
        result_network['support_conse'] = result_network[
            'confidence'] / result_network['lift']
        #edges_colors = preprocessing.LabelEncoder()
        #edges_colors.fit(result_network['lift'])

        #edges_colors = edges_colors.transform(result_network['lift'])
        #result_network['edge_colors'] = edges_colors

        result_network = result_network.reset_index()
        # One directed edge per surviving rule: antecedent item -> consequent item.
        edges = []
        for i in range(len(result_network.string_ante)):
            edges += [(result_network.string_ante[i],
                       result_network.string_conse[i])]

        G = nx.DiGraph()
        G.add_edges_from(edges)
        nodes = G.nodes()
        # Figure grows with sqrt of the node count so dense graphs stay legible.
        plt.figure(figsize=(4 * len(nodes)**0.5 * figure_size_muliplier,
                            4 * len(nodes)**0.5 * figure_size_muliplier))
        pos = nx.spring_layout(G, k=0.4 * edge_length_scaling)

        # Deduplicate items and attach each one's support for node sizing.
        node_tmp = list(result_network.string_ante) + list(
            result_network.string_conse)
        support_tmp = list(result_network.support_ante) + list(
            result_network.support_conse)
        tmp_node_support = []
        for i in range(len(node_tmp)):
            tmp_node_support += [[node_tmp[i], support_tmp[i]]]
        nodes_table = pd.DataFrame.from_records(tmp_node_support,
                                                columns=['name', 'support'])
        nodes_table = nodes_table.drop_duplicates(['name'])
        node_color = []
        nodes_table = nodes_table.reset_index()
        scaled_support = _scaling(nodes_table.support)
        # Node color/size list must follow G's node iteration order.
        for node in nodes:
            for i in range(len(nodes_table.name)):
                if nodes_table.name[i] == node:
                    node_color += [
                        scaled_support[i] * 2500 * node_size_scaling
                    ]
                    break
        #if(scaling==True):
    #     edge_color = [result_network['edge_colors'][n] for n in range(len(result_network['length_conse']))]
    #else:
        # Edge width from scaled confidence, edge color from raw lift.
        scaled_support = _scaling(result_network['confidence'])
        edge_size = [
            scaled_support[n] * 8
            for n in range(len(result_network['length_conse']))
        ]
        edge_color = [
            result_network['lift'][n]
            for n in range(len(result_network['length_conse']))
        ]
        nx.draw(G,
                pos,
                node_color=node_color,
                edge_color=edge_color,
                node_size=node_color,
                arrowsize=20 * (0.2 + 0.8 * node_size_scaling),
                font_family='NanumGothic',
                with_labels=True,
                cmap=plt.cm.Blues,
                edge_cmap=plt.cm.Reds,
                arrows=True,
                edge_size=edge_color,
                width=edge_size,
                font_size=font_size)

        fig_digraph = plt2MD(plt)

        # Legend ranges for the report header.
        graph_min_support = np.min(nodes_table.support)
        graph_max_support = np.max(nodes_table.support)
        graph_min_confidence = np.min(result_network['confidence'])
        graph_max_confidence = np.max(result_network['confidence'])
        graph_min_lift = np.min(result_network['lift'])
        graph_max_lift = np.max(result_network['lift'])

        rb = BrtcReprBuilder()
        rb.addMD(
            strip_margin("""
        | ### Network Digraph
        | ##### Node color, size : support ({graph_min_support}~{graph_max_support})
        | ##### Edge color : lift ({graph_min_lift}~{graph_max_lift})
        | ##### Edge size : confidence ({graph_min_confidence}~{graph_max_confidence})
        | {image1}
        |
        """.format(image1=fig_digraph,
                   graph_min_support=graph_min_support,
                   graph_max_support=graph_max_support,
                   graph_min_lift=graph_min_lift,
                   graph_max_lift=graph_max_lift,
                   graph_min_confidence=graph_min_confidence,
                   graph_max_confidence=graph_max_confidence)))

    elif (option == 'multiple_to_single'):

        result_network = table.copy()
        # NOTE(review): length_ante/string_ante are initialized but unused
        # in this branch.
        length_ante = []
        string_ante = []
        length_conse = []
        string_conse = []
        for row in result_network['consequent']:
            length_conse += [len(row)]
            string_conse += [row[0]]
        result_network['length_conse'] = length_conse
        result_network['consequent'] = string_conse
        # Keep only rules whose consequent is a single item.
        result_network = result_network[result_network.length_conse == 1]
        index_list = result_network.index.tolist()
        # Rule-node labels: visible 'R<n>' or unique blank strings.
        rownum = []
        for i in range(len(result_network['consequent'])):
            if display_rule_num:
                rownum += ['R%d' % (i + 1)]
            else:
                rownum += [_n_blank_strings(i + 1)]
        result_network['row_number'] = rownum
        # Each rule becomes a hub node: antecedents -> rule -> consequent.
        edges = []
        nodes = []
        for i in index_list:
            for j in range(len(result_network.antecedent[i])):
                edges += [(result_network.antecedent[i][j],
                           result_network['row_number'][i])]
            edges += [(result_network['row_number'][i],
                       result_network.consequent[i])]
            nodes += [result_network['row_number'][i]]

        G = nx.DiGraph()
        G.add_nodes_from(nodes)
        G.add_edges_from(edges)
        plt.figure(figsize=(2 * len(nodes)**0.5 * figure_size_muliplier,
                            2 * len(nodes)**0.5 * figure_size_muliplier))
        pos = nx.spring_layout(G, k=0.2 * edge_length_scaling)
        # Rule nodes get support-based color and lift-based size; item nodes
        # (appended after the rule nodes by add_edges_from) get zeros.
        nodes_color = []
        nodes_size = []
        scaled_lift = _scaling(result_network.lift)
        for node in range(len(G.nodes())):
            if node < len(nodes):
                nodes_color += [result_network.support[index_list[node]]]
                nodes_size += [scaled_lift[node] * 2000 * node_size_scaling]
            else:
                nodes_color += [0]
                nodes_size += [0]

        nx.draw(G,
                pos,
                node_color=nodes_color,
                node_size=nodes_size,
                font_family='NanumGothic',
                with_labels=True,
                cmap=plt.cm.Reds,
                arrows=True,
                edge_color='Grey',
                font_weight='bold',
                arrowsize=20 * (0.2 + 0.8 * node_size_scaling),
                font_size=font_size)
        fig_digraph = plt2MD(plt)

        graph_min_support = np.min(result_network.support)
        graph_max_support = np.max(result_network.support)
        graph_min_lift = np.min(result_network.lift)
        graph_max_lift = np.max(result_network.lift)

        rb = BrtcReprBuilder()
        rb.addMD(
            strip_margin("""
        | ### Network Digraph
        | ##### Size of circle : support ({graph_min_support}~{graph_max_support})
        | ##### Color of circle : lift ({graph_min_lift}~{graph_max_lift})
        | {image1}
        |
        """.format(image1=fig_digraph,
                   graph_min_support=graph_min_support,
                   graph_max_support=graph_max_support,
                   graph_min_lift=graph_min_lift,
                   graph_max_lift=graph_max_lift)))

    else:

        # Fallback: multiple-to-multiple — no filtering on rule shape.
        result_network = table.copy()
        # NOTE(review): length_ante/string_ante/string_conse are initialized
        # but unused in this branch.
        length_ante = []
        string_ante = []
        length_conse = []
        string_conse = []
        for row in result_network['consequent']:
            length_conse += [len(row)]
        result_network['length_conse'] = length_conse
        result_network = result_network.reset_index()
        rownum = []
        for i in range(len(result_network['consequent'])):
            if display_rule_num:
                rownum += ['R%d' % i]
            else:
                rownum += [_n_blank_strings(i + 1)]
        result_network['row_number'] = rownum
        # Each rule hub links all antecedent items to all consequent items.
        edges = []
        nodes = []
        for i in range(len(result_network.consequent)):
            for j in range(len(result_network.antecedent[i])):
                edges += [(result_network.antecedent[i][j],
                           result_network['row_number'][i])]
            for j in range(len(result_network.consequent[i])):
                edges += [(result_network['row_number'][i],
                           result_network.consequent[i][j])]
            nodes += [result_network['row_number'][i]]

        G = nx.DiGraph()
        G.add_nodes_from(nodes)
        G.add_edges_from(edges)
        plt.figure(figsize=(2 * len(nodes)**0.5 * figure_size_muliplier,
                            2 * len(nodes)**0.5 * figure_size_muliplier))
        pos = nx.spring_layout(G, k=0.2 * edge_length_scaling)
        nodes_color = []
        nodes_size = []
        scaled_lift = _scaling(result_network.lift)
        for node in range(len(G.nodes())):
            if node < len(nodes):
                nodes_color += [result_network.support[node]]
                nodes_size += [scaled_lift[node] * 2000 * node_size_scaling]
            else:
                nodes_color += [0]
                nodes_size += [0]

        nx.draw(G,
                pos,
                node_color=nodes_color,
                node_size=nodes_size,
                font_family='NanumGothic',
                with_labels=True,
                cmap=plt.cm.Reds,
                arrows=True,
                edge_color='Grey',
                font_weight='bold',
                arrowsize=20 * (0.2 + 0.8 * node_size_scaling),
                font_size=font_size)
        fig_digraph = plt2MD(plt)

        graph_min_support = np.min(result_network.support)
        graph_max_support = np.max(result_network.support)
        graph_min_lift = np.min(result_network.lift)
        graph_max_lift = np.max(result_network.lift)

        rb = BrtcReprBuilder()
        rb.addMD(
            strip_margin("""
        | ### Network Digraph
        | ##### Size of circle : support ({graph_min_support}~{graph_max_support})
        | ##### Color of circle : lift ({graph_min_lift}~{graph_max_lift})
        | {image1}
        |
        """.format(image1=fig_digraph,
                   graph_min_support=graph_min_support,
                   graph_max_support=graph_max_support,
                   graph_min_lift=graph_min_lift,
                   graph_max_lift=graph_max_lift)))

    model = _model_dict('Association rule')
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Ejemplo n.º 17
0
def _autocorrelation(table, input_col, nlags=20, conf_level=0.95):
    """ACF/PACF analysis of one column: plots, per-lag value tables with
    confidence intervals, and a combined markdown report.

    Returns {'model': model} carrying both tables and the report.
    """
    series = table[input_col]
    alpha = 1 - conf_level

    plt.figure()
    plot_acf(series, lags=nlags, alpha=alpha)
    fig_plt_acf = plt2MD(plt)
    plt.clf()

    plt.figure()
    plot_pacf(series, lags=nlags, alpha=alpha)
    fig_plt_pacf = plt2MD(plt)
    plt.clf()

    # Returns (values, confint) pairs when alpha is given.
    acf_ret = acf(series, nlags=nlags, alpha=alpha)
    pacf_ret = pacf(series, nlags=nlags, alpha=alpha)

    lag_range = list(range(nlags + 1))

    result_table1 = pd.DataFrame([])
    result_table1['lag'] = lag_range
    result_table1['ACF'] = acf_ret[0]
    if conf_level is not None:
        ci_label = '%g%% confidence Interval' % (conf_level * 100)
        result_table1[ci_label] = [
            str((acf_ret[1][i][0], acf_ret[1][i][1])) for i in lag_range
        ]

    result_table2 = pd.DataFrame([])
    result_table2['lag'] = lag_range
    result_table2['PACF'] = pacf_ret[0]
    if conf_level is not None:
        ci_label = '%g%% confidence Interval' % (conf_level * 100)
        result_table2[ci_label] = [
            str((pacf_ret[1][i][0], pacf_ret[1][i][1])) for i in lag_range
        ]

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""# Autocorrelation / Partial Autocorrelation Result"""))
    rb.addMD(
        strip_margin("""
    |## Autocorrelation
    |
    |{image1}
    |
    |### Autocorrelation Table
    |
    |{result_table1}
    |
    |## Partial Autocorrelation
    |
    |{image2}
    |
    |### Partial Autocorrelation Table
    |
    |{result_table2}
    |
    """.format(image1=fig_plt_acf,
               result_table1=pandasDF2MD(result_table1, num_rows=nlags + 1),
               image2=fig_plt_pacf,
               result_table2=pandasDF2MD(result_table2, num_rows=nlags + 1))))

    model = _model_dict('autocorrelation')
    model['autocorrelation_table'] = result_table1
    model['partial_autocorrelation_table'] = result_table2
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Ejemplo n.º 18
0
def _decision_tree_regression_train(
        table,
        feature_cols,
        label_col,  # fig_size=np.array([6.4, 4.8]), 
        criterion='mse',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        presort=False,
        sample_weight=None,
        check_input=True,
        X_idx_sorted=None):
    """Train a scikit-learn DecisionTreeRegressor on the given table.

    Builds a markdown report with the rendered tree (when graphviz and
    pydotplus are available), a feature-importance bar chart and the
    hyper-parameters. Returns {'model': model} where model carries the
    fitted regressor, its parameters, importances and the report.
    """

    # Reject obviously invalid hyper-parameters up front with readable errors.
    param_validation_check = [
        greater_than_or_equal_to(min_samples_split, 2, 'min_samples_split'),
        greater_than_or_equal_to(min_samples_leaf, 1, 'min_samples_leaf'),
        greater_than_or_equal_to(min_weight_fraction_leaf, 0.0,
                                 'min_weight_fraction_leaf')
    ]
    if max_depth is not None:
        param_validation_check.append(
            greater_than_or_equal_to(max_depth, 1, 'max_depth'))

    validate(*param_validation_check)

    # Pass hyper-parameters by keyword so the call does not silently break
    # if the positional order of DecisionTreeRegressor's signature changes
    # between scikit-learn versions.
    regressor = DecisionTreeRegressor(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        presort=presort)
    regressor.fit(table[feature_cols], table[label_col],
                  sample_weight=sample_weight,
                  check_input=check_input,
                  X_idx_sorted=X_idx_sorted)

    # Graphviz rendering is optional: fall back to an instruction message
    # rather than failing the whole training step when it is missing.
    try:
        from sklearn.externals.six import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(regressor,
                        out_file=dot_data,
                        feature_names=feature_cols,
                        filled=True,
                        rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:  # narrowed from bare `except:` so Ctrl-C still propagates
        fig_tree = "Graphviz is needed to draw a Decision Tree graph. Please download it from http://graphviz.org/download/ and install it to your computer."

    # json
    model = _model_dict('decision_tree_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    feature_importance = regressor.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = regressor.max_features_
    model['n_features'] = regressor.n_features_
    model['n_outputs'] = regressor.n_outputs_
    model['tree'] = regressor.tree_
    get_param = regressor.get_params()
    model['parameters'] = get_param
    model['regressor'] = regressor

    # report

    # Feature-importance bar chart, ascending, one value label per bar.
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)),
             feature_importance[indices],
             color='b',
             align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v,
                 i,
                 " {:.2f}".format(v),
                 color='b',
                 va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    # Add tree plot

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Decision Tree Regression Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Ejemplo n.º 19
0
def _penalized_linear_regression_train(table,
                                       feature_cols,
                                       label_col,
                                       regression_type='ridge',
                                       alpha=1.0,
                                       l1_ratio=0.5,
                                       fit_intercept=True,
                                       max_iter=1000,
                                       tol=0.0001,
                                       random_state=None):
    """Train a penalized linear regression (ridge, lasso or elastic-net).

    Parameters
    ----------
    table : pandas.DataFrame
        Input data.
    feature_cols : list
        Feature column names.
    label_col : str
        Label column name.
    regression_type : {'ridge', 'lasso', 'elastic_net'}
        Which penalty to apply.
    alpha : float
        Regularization strength (penalty weight).
    l1_ratio : float
        Elastic-net mixing parameter; used only when
        ``regression_type == 'elastic_net'``.
    fit_intercept : bool
        Whether to estimate an intercept term.
    max_iter : int
        Maximum number of iterations. Ridge keeps the original behavior of
        ``max_iter=None`` with ``solver='auto'``.
    tol : float
        Optimization tolerance.
    random_state : int or None
        Seed for the random coordinate selection of lasso / elastic-net.

    Returns
    -------
    dict
        ``{'model': ...}`` holding the fitted estimator, the coefficient
        table, the prediction/residual table and the report markup.

    Raises
    ------
    BrighticsFunctionException
        Via ``raise_runtime_error`` when ``regression_type`` is unknown.
    """
    out_table = table.copy()
    feature_names, features = check_col_type(out_table, feature_cols)
    label = out_table[label_col]
    if regression_type == 'ridge':
        regression_model = Ridge(alpha=alpha,
                                 fit_intercept=fit_intercept,
                                 max_iter=None,
                                 tol=tol,
                                 solver='auto',
                                 random_state=random_state)
    elif regression_type == 'lasso':
        regression_model = Lasso(alpha=alpha,
                                 fit_intercept=fit_intercept,
                                 max_iter=max_iter,
                                 tol=tol,
                                 random_state=random_state,
                                 selection='random')
    elif regression_type == 'elastic_net':
        regression_model = ElasticNet(alpha=alpha,
                                      l1_ratio=l1_ratio,
                                      fit_intercept=fit_intercept,
                                      max_iter=max_iter,
                                      tol=tol,
                                      random_state=random_state,
                                      selection='random')
    else:
        raise_runtime_error("Please check 'regression_type'.")

    # Fit exactly once and reuse the fitted model everywhere. The original
    # code refit the estimator again for the coefficients and again for the
    # intercept; with selection='random' and random_state=None those refits
    # could converge to parameters different from the ones actually used
    # for prediction.
    regression_model.fit(features, label)

    # Coefficient table: one row per feature, plus the intercept if fitted.
    out_table1 = pd.DataFrame()
    out_table1['x_variable_name'] = list(feature_names)
    out_table1['coefficient'] = regression_model.coef_
    if fit_intercept:
        intercept = pd.DataFrame(
            [['intercept', regression_model.intercept_]],
            columns=['x_variable_name', 'coefficient'])
        # DataFrame.append was removed in pandas 2.x; concat is equivalent.
        out_table1 = pd.concat([out_table1, intercept], ignore_index=True)

    predict = regression_model.predict(features)
    residual = label - predict

    out_table['predict'] = predict
    out_table['residual'] = residual

    if regression_type == 'elastic_net':
        params = {
            'Feature Columns': feature_names,
            'Label Column': label_col,
            'Regression Type': regression_type,
            'Regularization (Penalty Weight)': alpha,
            'L1 Ratio': l1_ratio,
            'Fit Intercept': fit_intercept,
            'Maximum Number of Iterations': max_iter,
            'Tolerance': tol
        }
    else:
        params = {
            'Feature Columns': feature_names,
            'Label Column': label_col,
            'Regression Type': regression_type,
            'Regularization (Penalty Weight)': alpha,
            'Fit Intercept': fit_intercept,
            # typo 'Maxium' fixed in the report key
            'Maximum Number of Iterations': max_iter,
            'Tolerance': tol
        }

    score = {
        'MSE': mean_squared_error(label, predict),
        'R2': r2_score(label, predict)
    }

    # Predicted vs actual scatter with an identity reference line.
    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    x = predict
    p1x = np.min(x)
    p2x = np.max(x)
    plt.plot([p1x, p2x], [p1x, p2x], 'r--')
    fig_actual_predict = plt2MD(plt)
    plt.clf()

    # Residuals vs predicted values.
    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)
    plt.clf()

    # Normal Q-Q plot of the residuals.
    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)
    plt.clf()

    # Residual distribution.
    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)
    plt.clf()

    # Bar chart of coefficient magnitudes, sorted ascending.
    plt.figure()
    predictors = feature_names
    coef = Series(regression_model.coef_, predictors).sort_values()
    coef.plot(kind='bar', title='Model Coefficients')
    plt.tight_layout()
    fig_model_coefficients = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | # Penalized Linear Regression Result
    | ### Selected Parameters: 
    | {params}
    |
    | ## Results
    | ### Model Parameters
    | {out_table1}
    |
    | ### Regression Score
    | {score}
    |
    """.format(params=dict2MD(params),
               out_table1=pandasDF2MD(out_table1),
               score=dict2MD(score))))
    rb.addMD(
        strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    |
    | ### Magnitude of Coefficients
    | {image5}
    |
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3,
               image5=fig_model_coefficients)))

    model = _model_dict('penalized_linear_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['regression_type'] = regression_type
    model['regression_model'] = regression_model
    model['parameters'] = params
    model['model_parameters'] = out_table1
    model['prediction_residual'] = out_table
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Ejemplo n.º 20
0
def _hierarchical_clustering(table,
                             input_cols,
                             input_mode='original',
                             key_col=None,
                             link='complete',
                             met='euclidean',
                             num_rows=20,
                             figure_height=6.4,
                             orient='right'):
    """Run hierarchical (agglomerative) clustering and report a dendrogram.

    Parameters
    ----------
    table : pandas.DataFrame
        Input data: raw observations ('original' mode) or a precomputed
        square distance matrix ('matrix' mode).
    input_cols : list
        Columns used as features ('original') or as the matrix columns
        ('matrix').
    input_mode : {'original', 'matrix'}
        How to interpret ``table``.
    key_col : str or None
        Optional column providing display names for the leaves; when None,
        names 'pt_0', 'pt_1', ... are generated.
    link : str
        Linkage method passed to ``scipy.cluster.hierarchy.linkage``.
    met : str
        Distance metric.
    num_rows : int
        Number of linkage-matrix rows shown in the report.
    figure_height : float
        Dendrogram figure height in inches.
    orient : {'right', 'top'}
        Dendrogram orientation (controls axis labeling).

    Returns
    -------
    dict
        ``{'model': ...}`` with the scipy linkage matrix ``Z``, a readable
        linkage table, parameters and report markup.
    """
    out_table = table.copy()
    features = out_table[input_cols]

    if input_mode == 'original':
        # Raw observations: one dendrogram leaf per table row.
        len_features = len(features)
        if key_col != None:
            data_names = list(out_table[key_col])
        elif key_col == None:
            data_names = ['pt_' + str(i) for i in range(len_features)]
        out_table['name'] = data_names
        Z = linkage(ssd.pdist(features, metric=met), method=link, metric=met)
    elif input_mode == 'matrix':
        # Precomputed distances: one leaf per selected column.
        # NOTE(review): assumes table holds a square symmetric distance
        # matrix whose rows align with input_cols -- confirm with callers.
        len_features = len(input_cols)
        if key_col != None:
            data_names = []
            for column in input_cols:
                data_names.append(
                    out_table[key_col][out_table.columns.get_loc(column)])
        elif key_col == None:
            data_names = []
            for column in input_cols:
                data_names.append(
                    out_table.columns[out_table.columns.get_loc(column)])
        col_index = []
        for column in input_cols:
            col_index.append(out_table.columns.get_loc(column))
        # Select matching rows (by position) to get the square sub-matrix.
        dist_matrix = features.iloc[col_index]

        # squareform converts the square matrix to condensed form for linkage.
        Z = linkage(ssd.squareform(dist_matrix), method=link, metric=met)
        dist_matrix['name'] = data_names
    else:
        raise_runtime_error("Please check 'input_mode'.")

    # Build a human-readable linkage table. Row i of Z merges two clusters;
    # steps and cluster names are numbered in reverse so the final merge is
    # step 1 / CL_1.
    range_len_Z = range(len(Z))
    linkage_matrix = pd.DataFrame([])
    linkage_matrix['linkage step'] = [
        '%g' % (x + 1) for x in reversed(range_len_Z)
    ]
    linkage_matrix['name of clusters'] = [
        'CL_%g' % (i + 1) for i in reversed(range_len_Z)
    ]
    # In scipy's Z, an index < n refers to an original observation; an index
    # >= n refers to the cluster formed at row (index - n).
    joined_column1 = []
    for i in range_len_Z:
        if Z[:, 0][i] < len_features:
            joined_column1.append(data_names[int(Z[:, 0][i])])
        elif Z[:, 0][i] >= len_features:
            # NOTE(review): indexes the Series with a float label
            # (Z values are floats) -- relies on pandas accepting
            # float keys on an integer index; verify on newer pandas.
            joined_column1.append(
                linkage_matrix['name of clusters'][Z[:, 0][i] - len_features])
    linkage_matrix['joined column1'] = joined_column1
    joined_column2 = []
    for i in range_len_Z:
        if Z[:, 1][i] < len_features:
            joined_column2.append(data_names[int(Z[:, 1][i])])
        elif Z[:, 1][i] >= len_features:
            joined_column2.append(
                linkage_matrix['name of clusters'][Z[:, 1][i] - len_features])
    linkage_matrix['joined column2'] = joined_column2

    # Z[:, 2] is the merge distance, Z[:, 3] the number of original
    # observations in the newly formed cluster.
    linkage_matrix['distance'] = [distance for distance in Z[:, 2]]
    linkage_matrix['number of original'] = [
        int(entities) for entities in Z[:, 3]
    ]
    # Present merges from last (root) to first.
    linkage_matrix = linkage_matrix.reindex(
        index=linkage_matrix.index[::-1])[0:]

    # calculate full dendrogram

    plt.figure(figsize=(8.4, figure_height))
    dendrogram(Z,
               truncate_mode='none',
               get_leaves=True,
               orientation=orient,
               labels=data_names,
               leaf_rotation=45,
               leaf_font_size=10.,
               show_contracted=False)
    plt.title('Hierarchical Clustering Dendrogram')
    if orient == 'top':
        plt.xlabel('Samples')
        plt.ylabel('Distance')
    elif orient == 'right':
        plt.xlabel('Distance')
        plt.ylabel('Samples')
    plt.tight_layout()
    plt2 = plt2MD(plt)
    plt.clf()

    params = {
        'Input Columns': input_cols,
        'Input Mode': input_mode,
        'Linkage Method': link,
        'Metric': met,
        'Number of Rows in Linkage Matrix': num_rows
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Hierarchical Clustering Result"""))
    rb.addMD(
        strip_margin("""
    |### Dendrogram
    |
    |{image}
    |
    |### Parameters
    |
    |{display_params}
    |
    |### Linkage Matrix
    |
    |{out_table1}
    |
    """.format(image=plt2,
               display_params=dict2MD(params),
               out_table1=pandasDF2MD(linkage_matrix.head(num_rows),
                                      num_rows=num_rows + 1))))

    model = _model_dict('hierarchical_clustering')
    model['model'] = Z
    model['input_mode'] = input_mode
    model['table'] = out_table
    if input_mode == 'matrix':
        model['dist_matrix'] = dist_matrix
    model['parameters'] = params
    model['linkage_matrix'] = linkage_matrix
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Ejemplo n.º 21
0
def _linear_regression_train(table,
                             feature_cols,
                             label_col,
                             fit_intercept=True,
                             is_vif=False,
                             vif_threshold=10):
    """Fit an OLS linear regression (statsmodels) and build a report.

    Parameters
    ----------
    table : pandas.DataFrame
        Input data.
    feature_cols : list
        Feature column names.
    label_col : str
        Label column name.
    fit_intercept : bool
        If True, a constant column is prepended so OLS estimates an
        intercept (statsmodels does not add one by itself).
    is_vif : bool
        If True, append variance inflation factors to the coefficient table.
    vif_threshold : float
        Threshold for flagging high VIF values.

    Returns
    -------
    dict
        ``{'model': ...}`` with fit statistics, summary tables and report
        markup. ``remove_data()`` is called on the results object so the
        stored model does not retain the training data.
    """
    feature_names, features = check_col_type(table, feature_cols)
    label = table[label_col]

    # Both branches of the original if/else ran the identical fit; only the
    # constant column differs, so add it conditionally and fit once.
    if fit_intercept:
        features = sm.add_constant(features, has_constant='add')
    lr_model_fit = sm.OLS(label, features).fit()

    predict = lr_model_fit.predict(features)
    residual = label - predict

    summary = lr_model_fit.summary()
    summary_tables = simple_tables2df_list(summary.tables, drop_index=True)
    summary0 = summary_tables[0]
    summary1 = summary_tables[1]

    # check_col_type may return a numpy array; VIF computation below needs
    # a DataFrame-like .values/.shape interface.
    if type(features) != type(table):
        features = pd.DataFrame(features)

    if is_vif:
        # One VIF per regressor (including the constant when fit_intercept).
        summary1['VIF'] = [
            variance_inflation_factor(features.values, i)
            for i in range(features.shape[1])
        ]
        summary1['VIF>{}'.format(vif_threshold)] = summary1['VIF'].apply(
            lambda _: 'true' if _ > vif_threshold else 'false')
    summary.tables[1] = _df_to_simpletable(summary1)
    summary2 = summary_tables[2]

    html_result = summary.as_html()

    # Predicted vs actual with an identity reference line.
    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    x = predict
    p1x = np.min(x)
    p2x = np.max(x)
    plt.plot([p1x, p2x], [p1x, p2x], 'r--')
    fig_actual_predict = plt2MD(plt)
    plt.clf()  # release the figure after capture (the original leaked them)

    # Residuals vs predicted values.
    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)
    plt.clf()

    # Normal Q-Q plot of the residuals.
    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)
    plt.clf()

    # Residual distribution.
    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Linear Regression Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(
        strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3)))

    model = _model_dict('linear_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['coefficients'] = lr_model_fit.params
    model['fit_intercept'] = fit_intercept
    model['r2'] = lr_model_fit.rsquared
    model['adjusted_r2'] = lr_model_fit.rsquared_adj
    model['aic'] = lr_model_fit.aic
    model['bic'] = lr_model_fit.bic
    model['f_static'] = lr_model_fit.fvalue
    model['tvalues'] = lr_model_fit.tvalues
    model['pvalues'] = lr_model_fit.pvalues
    model['_repr_brtc_'] = rb.get()

    model['summary0'] = summary0
    model['summary1'] = summary1
    model['summary2'] = summary2
    lr_model_fit.remove_data()
    model['lr_model'] = lr_model_fit
    return {'model': model}
Ejemplo n.º 22
0
def _kmeans_silhouette_train_predict(table, input_cols, n_clusters_list=range(2, 10), prediction_col='prediction',
                                     init='k-means++', n_init=10, max_iter=300, tol=1e-4, precompute_distances='auto',
                                     seed=None, n_jobs=1, algorithm='auto', n_samples=None):
    """Fit k-means for each k in n_clusters_list and pick the best k by
    silhouette score.

    For every candidate k a silhouette plot and a 2-D PCA scatter of the
    clusters are added to the report; the model with the highest silhouette
    score is used to label ``prediction_col`` in the output table.

    NOTE(review): ``precompute_distances`` and ``n_jobs`` were deprecated
    and later removed from scikit-learn's KMeans -- confirm the pinned
    sklearn version still accepts them.

    Returns
    -------
    dict
        ``{'out_table': table + prediction column, 'model': summary model}``.
    """
    
    feature_names, features = check_col_type(table, input_cols)

    if n_samples is None:
        n_samples = len(table)
    inputarr = features
    
    # 2-D PCA projection used only for visualization (1-D when there is a
    # single feature column).
    pca2_model = PCA(n_components=min(2, len(feature_names))).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)
    
    silhouette_list = []
    models = []
    centers_list = []
    images = []
    # Fit one k-means model per candidate number of clusters.
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k, init=init, n_init=n_init, max_iter=max_iter, tol=tol,
                           precompute_distances=precompute_distances, verbose=0, random_state=seed, copy_x=True,
                           n_jobs=n_jobs, algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_
        centers_list.append(centersk)
        
        # Overall silhouette score and per-sample values for the plot.
        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        samples = silhouette_samples(inputarr, predict)
        # silhouette_samples_list.append(samples)
    
        pca2_centers = pca2_model.transform(centersk)

        # ax1: silhouette profile per cluster; ax2: PCA scatter of clusters.
        _, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0

        for i, color in zip(range(k), colors):
            # Sorted silhouette values of the samples in cluster i, stacked
            # vertically to form the classic silhouette band.
            si = samples[predict == i]
            si.sort()

            sizei = si.shape[0]
            y_upper = y_lower + sizei

            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, si,
                              facecolor=color, edgecolor=color, alpha=0.7)
            
            # cluster label
            ax1.text(0.9, y_lower + 0.45 * sizei, str(i))

            y_lower = y_upper
            
            # With a 1-D projection, plot the single component against itself.
            if pca2.shape[1] == 1:
                ax2.scatter(pca2[:, 0][predict == i], pca2[:, 0][predict == i], color=color)
            else:
                ax2.scatter(pca2[:, 0][predict == i], pca2[:, 1][predict == i], color=color)

        # Red line marks the mean silhouette score for this k.
        ax1.axvline(x=score, color="red")
        ax1.set_xlim(right=1.0)
        ax1.set_yticks([])
        ax1.set_xlabel("Silhouette coefficient values")
        ax1.set_ylabel("Cluster label")
        
        if pca2.shape[1] == 1:
            ax2.scatter(pca2_centers[:, 0], pca2_centers[:, 0], marker='x', edgecolors=1, s=200, color=colors)
            ax2.set_xlabel("Feature space for the 1st feature")
            ax2.set_ylabel("Feature space for the 1st feature")
        else:
            ax2.scatter(pca2_centers[:, 0], pca2_centers[:, 1], marker='x', edgecolors=1, s=200, color=colors)
            ax2.set_xlabel("Feature space for the 1st feature")
            ax2.set_ylabel("Feature space for the 2nd feature")   
        
        plt.tight_layout()
        imagek = plt2MD(plt)
        plt.clf()
        images.append(imagek)
    
    # Best k = the candidate with the highest silhouette score.
    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_
    best_labels = best_model.labels_
    best_sse = best_model.inertia_
    
    n_clusters = len(best_centers)
    colors = cm.nipy_spectral(np.arange(n_clusters).astype(float) / n_clusters)
    fig_centers = _kmeans_centers_plot(feature_names, best_centers, colors)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, best_centers, seed, colors)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2, colors)
    
    # Silhouette score as a function of k.
    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    plt.xlabel("Number of Clusters k")
    plt.tight_layout()
    fig_silhouette = plt2MD(plt)
    plt.clf()
    
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Silhouette Result
    | - silhoutte metrics:
    | {fig_silhouette}
    | - best K: {best_k} 
    | - Sum of square error: {best_sse}.
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette, best_k=best_k, best_sse=best_sse, fig_pca=fig_pca, fig_centers=fig_centers,
               fig_samples=fig_samples)))

    for k, image in zip(n_clusters_list, images):
        rb.addMD(strip_margin("""
        | ### k = {k}
        | {image}
        |
        """.format(k=k, image=image)))

    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()
    
    out_table = table.copy()
    out_table[prediction_col] = predict
    # out_table['silhouette'] = silhouette_samples_list[best_k-2]
    # out_table = out_table.sort_values(by=['prediction','silhouette'])  
    # out_table = out_table.reset_index(drop=True)
        
    return {'out_table': out_table, 'model': model}
Ejemplo n.º 23
0
def _evaluate_classification(table,
                             label_col,
                             prediction_col,
                             average="weighted"):
    """Compute classification metrics and confusion matrices for a report.

    Parameters
    ----------
    table : pandas.DataFrame
        Data containing the true labels and the predictions.
    label_col : str
        Column with the true labels.
    prediction_col : str
        Column with the predicted labels.
    average : str
        Averaging mode for f1/precision/recall; the string 'None' is mapped
        to Python None (per-class metrics).

    Returns
    -------
    dict
        ``{'result': summary}`` with scores, a metrics table and report
        markup including raw and normalized confusion matrices.
    """
    if average == 'None':
        average = None
    label = table[label_col]
    predict = table[prediction_col]

    # compute metrics
    accuracy = accuracy_score(label, predict)
    f1 = f1_score(label, predict, average=average)
    precision = precision_score(label, predict, average=average)
    recall = recall_score(label, predict, average=average)
    # Sorted union of observed classes -- the same order sklearn uses for
    # per-class metric arrays when average is None.
    class_names = np.unique(np.union1d(label.values, predict.values))

    # Plot non-normalized confusion matrix
    plt.figure()
    _plot_confusion_matrix(label,
                           predict,
                           classes=class_names,
                           title='Confusion matrix, without normalization')
    fig_cnf_matrix = plt2MD(plt)
    plt.clf()  # release the first figure (the original only cleared the second)
    # Plot normalized confusion matrix
    plt.figure()
    _plot_confusion_matrix(label,
                           predict,
                           classes=class_names,
                           normalize=True,
                           title='Normalized confusion matrix')
    fig_cnf_matrix_normalized = plt2MD(plt)
    plt.clf()

    # json
    summary = dict()
    summary['label_col'] = label_col
    summary['prediction_col'] = prediction_col
    summary['f1_score'] = f1
    summary['accuracy_score'] = accuracy
    summary['precision_score'] = precision
    summary['recall_score'] = recall

    # report
    if average == 'weighted' or average == 'macro':
        all_dict_list = [{'f1': f1, 'precision': precision, 'recall': recall}]
        all_df = pd.DataFrame(all_dict_list)
        all_df = all_df[['f1', 'precision', 'recall']]
    else:
        # Per-class metrics: one row per class.
        all_dict_list = [f1, precision, recall]
        all_df = pd.DataFrame(all_dict_list)
        all_df = all_df.transpose()
        all_df.columns = ['f1', 'precision', 'recall']
        # Use the sorted class union: the original `set(label)` had arbitrary
        # ordering (rows could be mislabeled) and the wrong length whenever
        # predictions contained a class absent from the true labels.
        all_df['label'] = class_names
        all_df = all_df[['label'] + all_df.columns[:-1].tolist()]
    summary['metrics'] = all_df

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Evaluate Classification Result
    |
    | ### Accuracy : {accuracy}
    |
    | ### Metrics
    | {table1}
    |
    | ### Confusion matrix
    | {fig_confusion_matrix}
    |
    | {fig_confusion_matrix_normalized}
    |
    """.format(accuracy=accuracy,
               table1=pandasDF2MD(all_df),
               fig_confusion_matrix=fig_cnf_matrix,
               fig_confusion_matrix_normalized=fig_cnf_matrix_normalized)))
    summary['_repr_brtc_'] = rb.get()

    return {'result': summary}
Ejemplo n.º 24
0
def _pca(table,
         input_cols,
         new_column_name='projected_',
         n_components=None,
         copy=True,
         whiten=False,
         svd_solver='auto',
         tol=0.0,
         iterated_power='auto',
         seed=None,
         hue=None,
         alpha=0,
         key_col=None):
    """Run principal component analysis and report a biplot and scree plot.

    Parameters
    ----------
    table : pandas.DataFrame
        Input data.
    input_cols : list
        Columns to decompose.
    new_column_name : str
        Prefix for the projected columns (projected_0, projected_1, ...).
    n_components : int or None
        Number of projected columns to keep; defaults to all input columns.
    copy, whiten, svd_solver, tol, iterated_power : sklearn PCA options.
    seed : int or None
        random_state for the PCA solver.
    hue : str or None
        Optional column used to color the scatter/biplot.
    alpha : float
        Biplot scaling exponent passed through to ``_biplot``.
    key_col : str or None
        Optional column used to label points in the biplot.

    Returns
    -------
    dict
        ``{'out_table': table + projections, 'model': pca model dict}``.
    """

    num_feature_cols = len(input_cols)
    if n_components is None:
        n_components = num_feature_cols

    # Fit the full decomposition (n_components=None) and slice the first
    # n_components afterwards. Keyword arguments are used because recent
    # scikit-learn releases no longer accept PCA's parameters positionally.
    pca = PCA(n_components=None,
              copy=copy,
              whiten=whiten,
              svd_solver=svd_solver,
              tol=tol,
              iterated_power=iterated_power,
              random_state=seed)
    pca_model = pca.fit(table[input_cols])

    # Names for the projected columns: projected_0, projected_1, ...
    column_names = [new_column_name + str(i) for i in range(n_components)]

    pca_result = pca_model.transform(table[input_cols])
    # Plain list of names (the original wrapped it in another list, which
    # produced a one-level MultiIndex).
    out_df = pd.DataFrame(data=pca_result[:, :n_components],
                          columns=column_names)

    out_df = pd.concat([table.reset_index(drop=True), out_df], axis=1)
    out_df.columns = table.columns.values.tolist() + column_names

    res_components = pca_model.components_
    res_components_df = pd.DataFrame(data=res_components[:n_components],
                                     columns=input_cols)
    res_explained_variance = pca_model.explained_variance_
    res_explained_variance_ratio = pca_model.explained_variance_ratio_
    res_singular_values = pca_model.singular_values_
    res_mean = pca_model.mean_
    res_n_components = pca_model.n_components_
    res_noise_variance = pca_model.noise_variance_

    res_get_param = pca_model.get_params()
    res_get_covariance = pca_model.get_covariance()
    res_get_precision = pca_model.get_precision()

    # visualization
    plt.figure()
    if n_components == 1:
        # With a single component, plot it against itself.
        sns.scatterplot(x=column_names[0], y=column_names[0], hue=hue,
                        data=out_df)
        plt_two = plt2MD(plt)
        plt.clf()
    else:
        plt_two = _biplot(
            0,
            1,
            pc_columns=column_names,
            columns=input_cols,
            singular_values=res_singular_values,
            components=res_components,
            explained_variance_ratio=res_explained_variance_ratio,
            alpha=alpha,
            hue=hue,
            data=out_df,
            ax=plt.gca(),
            key_col=key_col)

    plt.figure()
    fig_scree = _screeplot(res_explained_variance,
                           res_explained_variance_ratio, n_components)

    # Explained-variance table with cumulative ratio.
    table_explained_variance = pd.DataFrame(res_explained_variance,
                                            columns=['explained_variance'])
    table_explained_variance[
        'explained_variance_ratio'] = res_explained_variance_ratio
    table_explained_variance[
        'cum_explained_variance_ratio'] = res_explained_variance_ratio.cumsum(
        )

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## PCA Result
    | ### Plot
    | {image1}
    |
    | ### Explained Variance
    | {fig_scree}
    | {table_explained_variance}    
    |
    | ### Components
    | {table2}
    |
    | ### Parameters
    | {parameter1}
    """.format(image1=plt_two,
               fig_scree=fig_scree,
               table_explained_variance=pandasDF2MD(table_explained_variance),
               parameter1=dict2MD(res_get_param),
               table2=pandasDF2MD(res_components_df))))

    model = _model_dict('pca')
    model['components'] = res_components
    model['explained_variance'] = res_explained_variance
    model['explained_variance_ratio'] = res_explained_variance_ratio
    model['singular_values'] = res_singular_values
    model['mean'] = res_mean
    model['n_components'] = res_n_components
    model['noise_variance'] = res_noise_variance
    model['parameters'] = res_get_param
    model['covariance'] = res_get_covariance
    model['precision'] = res_get_precision
    model['_repr_brtc_'] = rb.get()
    model['pca_model'] = pca_model
    model['input_cols'] = input_cols

    return {'out_table': out_df, 'model': model}
Ejemplo n.º 25
0
def _biplot(xidx,
            yidx,
            data,
            pc_columns,
            columns,
            singular_values,
            components,
            explained_variance_ratio,
            alpha=1,
            ax=None,
            hue=None,
            key_col=None):
    """Draw a PCA biplot (scores plus loading arrows) and return it as markdown.

    Parameters
    ----------
    xidx, yidx : int
        Indices of the two principal components to plot.
    data : pandas.DataFrame
        Table containing the projected columns (and hue/key columns).
    pc_columns : list
        Names of the projected-component columns in ``data``.
    columns : list
        Original feature names, one per loading arrow.
    singular_values, components, explained_variance_ratio : arrays
        Fitted PCA attributes.
    alpha : float
        Scaling exponent: scores are scaled by singular_value**alpha and
        loadings by singular_value**(1 - alpha).
    ax : matplotlib axes or None
        Target axes; defaults to the current axes.
    hue : str or None
        Optional column used to color points by group.
    key_col : str or None
        Optional column whose values are drawn as text labels instead of dots.

    Returns
    -------
    str
        Markdown image produced by ``plt2MD``.
    """
    if ax is None:
        ax = plt.gca()

    # Component scores for the two requested PCs.
    xs = data[pc_columns[xidx]] * singular_values[xidx]**alpha
    ys = data[pc_columns[yidx]] * singular_values[yidx]**alpha

    if key_col is not None and hue is not None:
        # Draw each observation as its key text, colored by group.
        groups = data[hue].unique()
        k = len(groups)
        colors = cm.viridis(np.arange(k).astype(float) / k)
        for j, color in zip(range(k), colors):
            group_data = data[data[hue] == groups[j]]
            for idx in group_data.index:
                ax.text(xs[idx],
                        ys[idx],
                        data[key_col][idx],
                        color=color,
                        va='center',
                        ha='center')
        ax.legend([Patch(color=colors[i]) for i, _ in enumerate(groups)],
                  groups.tolist())
    elif key_col is not None:
        # Key labels without grouping: plain black text.
        for i in range(data.shape[0]):
            ax.text(xs[i],
                    ys[i],
                    data[key_col][i],
                    color='black',
                    va='center',
                    ha='center')
    elif hue is not None:
        # Keyword arguments: recent seaborn rejects positional x/y.
        sns.scatterplot(x=xs, y=ys, hue=data[hue], data=data, ax=ax)
    else:
        sns.scatterplot(x=xs, y=ys, data=data, ax=ax)

    ax.set_xlabel('%s (%0.4f)' %
                  (pc_columns[xidx], explained_variance_ratio[xidx]))
    ax.set_ylabel('%s (%0.4f)' %
                  (pc_columns[yidx], explained_variance_ratio[yidx]))

    # Loading arrows, scaled by the complementary power of the singular values.
    axs = components[xidx] * singular_values[xidx]**(1 - alpha)
    ays = components[yidx] * singular_values[yidx]**(1 - alpha)

    # Axis limits covering both the scores and the (extended) arrows.
    xmax = np.amax(np.concatenate((xs, axs * 1.5)))
    xmin = np.amin(np.concatenate((xs, axs * 1.5)))
    ymax = np.amax(np.concatenate((ys, ays * 1.5)))
    ymin = np.amin(np.concatenate((ys, ays * 1.5)))

    for i, col in enumerate(columns):
        x, y = axs[i], ays[i]
        ax.arrow(0, 0, x, y, color='r', width=0.001, head_width=0.05)
        ax.text(x * 1.3, y * 1.3, col, color='r', ha='center', va='center')

    # (Dead code removed: the original read ax.get_xlim()/get_ylim() into
    # unused variables that shadowed xs/ys.)
    m = 1.2
    ax.set_xlim(xmin * m, xmax * m)
    ax.set_ylim(ymin * m, ymax * m)

    plt_two = plt2MD(plt)
    plt.clf()

    return plt_two
Ejemplo n.º 26
0
def _plot_binary(label,
                 probability,
                 threshold=None,
                 fig_size=(6.4, 4.8),
                 pos_label=None):
    """Build the standard binary-classification evaluation plots.

    Parameters
    ----------
    label : pandas.Series
        True class labels (exactly two classes).
    probability : pandas.Series
        Predicted probability of the positive class.
    threshold : float or None
        Decision threshold; when None it is chosen at the ROC point where
        TPR is closest to 1 - FPR.
    fig_size : tuple
        Figure size for the ROC plot.
    pos_label : object
        Label treated as positive.

    Returns
    -------
    tuple
        (threshold, fig_tpr_fpr, fig_roc, fig_precision_recall, fig_pr,
        fig_confusion) where each fig_* is report markdown.
    """
    fpr, tpr, threshold_roc = roc_curve(label,
                                        probability,
                                        pos_label=pos_label)
    # tpf 1-fpr
    if threshold is None:
        # Pick the ROC point where TPR is closest to 1-FPR.
        argmin = np.argmin(np.abs(tpr + fpr - 1))
        threshold = threshold_roc[argmin]
    else:
        # Bug fix: the original left `argmin` undefined when a threshold was
        # supplied, raising NameError below. Use the ROC point whose
        # threshold is closest to the requested one.
        argmin = np.argmin(np.abs(threshold_roc - threshold))

    # TPR and 1-FPR as functions of the threshold.
    fpr_prop = fpr[argmin]
    tpr_prop = tpr[argmin]
    plt.plot(threshold_roc, tpr, color='blue', label='TPR')
    plt.plot(threshold_roc, 1 - fpr, color='red', label='1-FPR')
    plt.xlabel('Threshold')
    plt.ylabel('TPR or 1-FPR')
    plt.legend(loc="lower center")
    plt.axvline(threshold, linestyle='--')
    plt.text(threshold + 0.02,
             0.5,
             'threshold: %0.2f' % threshold,
             rotation=90,
             verticalalignment='center')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    fig_tpr_fpr = plt2MD(plt)
    plt.clf()

    # roc
    auc_score = auc(fpr, tpr)
    plt.figure(figsize=fig_size)
    plt.plot(fpr,
             tpr,
             color='darkorange',
             label='ROC curve (area = %0.2f)' % auc_score)
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.plot(fpr_prop,
             tpr_prop,
             'g*',
             markersize=10,
             color="red",
             label='threshold: %0.2f' % threshold)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    fig_roc = plt2MD(plt)
    plt.clf()

    # pr
    precision, recall, threshold_pr = precision_recall_curve(
        label, probability, pos_label=pos_label)
    # NOTE(review): `argmin` indexes the ROC arrays; precision/recall come
    # from a different (PR) grid, so this marker is approximate -- confirm
    # intended behavior.
    precision_prop = precision[argmin]
    recall_prop = recall[argmin]

    # Older matplotlib lacks the 'step' kwarg on fill_between.
    step_kwargs = ({
        'step': 'post'
    } if 'step' in signature(plt.fill_between).parameters else {})
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.plot(recall_prop,
             precision_prop,
             'g*',
             markersize=10,
             color="red",
             label='threshold: %0.2f' % threshold)
    plt.title('Precision-Recall curve')  # TODO Average precision score
    plt.legend()
    fig_pr = plt2MD(plt)
    plt.clf()

    # Precision and recall as functions of the threshold; precision/recall
    # have one more element than threshold_pr, hence the appended 1.
    threshold_pr = np.append(threshold_pr, 1)
    plt.plot(threshold_pr, precision, color='blue', label='Precision')
    plt.plot(threshold_pr, recall, color='red', label='Recall')
    plt.xlabel('Threshold')
    plt.ylabel('Precision or Recall')
    plt.legend(loc="lower center")
    plt.axvline(threshold, linestyle='--')
    plt.text(threshold + 0.02,
             0.5,
             'threshold: %0.2f' % threshold,
             rotation=90,
             verticalalignment='center')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    fig_precision_recall = plt2MD(plt)
    plt.clf()

    # Hard predictions at the chosen threshold, then the confusion matrix.
    classes = label.unique()
    neg_label = [cls for cls in classes if cls != pos_label][0]
    predict = probability.apply(lambda x: pos_label
                                if x >= threshold else neg_label)

    _plot_confusion_matrix(label,
                           predict, [pos_label, neg_label],
                           normalize=False,
                           title='Confusion matrix',
                           cmap=plt.cm.Blues)
    fig_confusion = plt2MD(plt)
    plt.clf()

    return threshold, fig_tpr_fpr, fig_roc, fig_precision_recall, fig_pr, fig_confusion
Ejemplo n.º 27
0
def _decision_tree_classification_train(
        table,
        feature_cols,
        label_col,  # fig_size=np.array([6.4, 4.8]),
        criterion='gini',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        class_weight=None,
        presort=False,
        sample_weight=None,
        check_input=True,
        X_idx_sorted=None):
    """Train a decision-tree classifier and build a Brightics report.

    ``table`` is a pandas DataFrame, ``feature_cols`` the feature column
    names and ``label_col`` the target column name; the remaining
    parameters mirror ``sklearn.tree.DecisionTreeClassifier`` and its
    ``fit`` method.

    Returns ``{'model': model}`` where ``model`` contains the fitted
    classifier, its parameters, feature importances and a markdown report
    under ``'_repr_brtc_'``.

    Raises the project error ``'0718'`` when the label is continuous.
    """
    y_train = table[label_col]

    # A continuous target cannot be classified; fail fast with the
    # project-specific error code.
    if sklearn_utils.multiclass.type_of_target(y_train) == 'continuous':
        raise_error('0718', 'label_col')

    # Keyword arguments keep this robust against positional-signature
    # changes between scikit-learn releases.
    classifier = DecisionTreeClassifier(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        class_weight=class_weight,
        presort=presort)
    classifier.fit(table[feature_cols], y_train,
                   sample_weight=sample_weight,
                   check_input=check_input,
                   X_idx_sorted=X_idx_sorted)

    # Best-effort tree rendering: needs the optional Graphviz toolchain
    # (pydotplus + a graphviz install).
    try:
        from sklearn.externals.six import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(classifier,
                        out_file=dot_data,
                        feature_names=feature_cols,
                        class_names=table[label_col].astype('str').unique(),
                        filled=True,
                        rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # are not swallowed; the report degrades to an install hint.
        fig_tree = "Graphviz is needed to draw a Decision Tree graph. Please download it from http://graphviz.org/download/ and install it to your computer."

    # Collect everything downstream nodes may need from the fitted model.
    model = _model_dict('decision_tree_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['classes'] = classifier.classes_
    feature_importance = classifier.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = classifier.max_features_
    model['n_classes'] = classifier.n_classes_
    model['n_features'] = classifier.n_features_
    model['n_outputs'] = classifier.n_outputs_
    model['tree'] = classifier.tree_
    get_param = classifier.get_params()
    model['parameters'] = get_param
    model['classifier'] = classifier

    # Horizontal bar chart of feature importances, least to most important.
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)),
             feature_importance[indices],
             color='b',
             align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v,
                 i,
                 " {:.2f}".format(v),
                 color='b',
                 va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.xlim(0, 1.1)
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)

    # Assemble the markdown report.
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Decision Tree Classification Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Ejemplo n.º 28
0
def _xgb_regression_train(table,
                          feature_cols,
                          label_col,
                          max_depth=3,
                          learning_rate=0.1,
                          n_estimators=100,
                          silent=True,
                          objectibe='reg:linear',
                          booster='gbtree',
                          n_jobs=1,
                          nthread=None,
                          gamma=0,
                          min_child_weight=1,
                          max_delta_step=0,
                          subsample=1,
                          colsample_bytree=1,
                          colsample_bylevel=1,
                          reg_alpha=0,
                          reg_lambda=1,
                          scale_pos_weight=1,
                          base_score=0.5,
                          random_state=0,
                          seed=None,
                          missing=None,
                          sample_weight=None,
                          eval_set=None,
                          eval_metric=None,
                          early_stopping_rounds=None,
                          verbose=True,
                          xgb_model=None,
                          sample_weight_eval_set=None):
    """Train an XGBoost regressor and build a Brightics report.

    NOTE: ``objectibe`` is a historical misspelling of XGBoost's
    ``objective`` parameter; the name is kept so existing callers keep
    working, but the value is forwarded as ``objective=``.

    Returns ``{'model': out_model}`` containing the fitted regressor, its
    parameters, the feature importances and a markdown report under
    ``'_repr_brtc_'``.
    """
    validate(greater_than_or_equal_to(max_depth, 1, 'max_depth'),
             greater_than_or_equal_to(learning_rate, 0.0, 'learning_rate'),
             greater_than_or_equal_to(n_estimators, 1, 'n_estimators'))

    # Keyword arguments protect against positional-signature drift across
    # xgboost releases (the original passed 21 positional arguments).
    regressor = XGBRegressor(max_depth=max_depth,
                             learning_rate=learning_rate,
                             n_estimators=n_estimators,
                             silent=silent,
                             objective=objectibe,
                             booster=booster,
                             n_jobs=n_jobs,
                             nthread=nthread,
                             gamma=gamma,
                             min_child_weight=min_child_weight,
                             max_delta_step=max_delta_step,
                             subsample=subsample,
                             colsample_bytree=colsample_bytree,
                             colsample_bylevel=colsample_bylevel,
                             reg_alpha=reg_alpha,
                             reg_lambda=reg_lambda,
                             scale_pos_weight=scale_pos_weight,
                             base_score=base_score,
                             random_state=random_state,
                             seed=seed,
                             missing=missing)
    regressor.fit(table[feature_cols], table[label_col],
                  sample_weight=sample_weight,
                  eval_set=eval_set,
                  eval_metric=eval_metric,
                  early_stopping_rounds=early_stopping_rounds,
                  verbose=verbose,
                  xgb_model=xgb_model,
                  sample_weight_eval_set=sample_weight_eval_set)

    # Built-in importance plot from xgboost.
    get_param = regressor.get_params()
    feature_importance = regressor.feature_importances_
    plot_importance(regressor)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()

    out_model = _model_dict('xgb_regression_model')
    out_model['feature_cols'] = feature_cols
    out_model['label_col'] = label_col
    out_model['parameters'] = get_param
    out_model['feature_importance'] = feature_importance
    out_model['regressor'] = regressor
    out_model['plot_importance'] = fig_plot_importance

    # Tabulate the column selections plus every fitted parameter for the
    # report.
    get_param_list = [['feature_cols', feature_cols],
                      ['label_col', label_col]]
    for key, value in get_param.items():
        get_param_list.append([key, value])
    get_param_df = pd.DataFrame(data=get_param_list,
                                columns=['parameter', 'value'])
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## XGB Regression Result
    |
    | ### Plot Importance
    | {image_importance}
    |
    | ### Feature Importance
    | {table_feature_importance}
    |
    | ### Parameters
    | {table_parameter}
    |
    """.format(image_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               table_parameter=pandasDF2MD(get_param_df))))
    out_model['_repr_brtc_'] = rb.get()

    return {'model': out_model}
Ejemplo n.º 29
0
def _oneway_anova(table, response_cols, factor_col):
    """Run a one-way ANOVA of each response column against ``factor_col``.

    For every column in ``response_cols`` the report gets a box plot per
    factor level, the ANOVA table, a residual distribution plot and a
    Q-Q plot. The p-value of the factor effect is stored under
    ``result['_grouped_data'][<response_col>]['p_value']``.

    Returns ``{'result': result}`` where ``result`` also carries the
    markdown report under ``'_repr_brtc_'``.
    """
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    ## One-way Analysis of Variance Result
    """))
    groups = table[factor_col].unique()
    groups.sort()
    # Total width of all group labels drives the tick-label rotation below.
    sum_len = np.sum([len(str(group)) for group in groups])

    result = dict()
    result['_grouped_data'] = dict()

    for response_col in response_cols:
        result['_grouped_data'][response_col] = dict()

        ax = sns.boxplot(x=factor_col,
                         y=response_col,
                         data=table,
                         order=groups)
        # Rotate x tick labels when the combined label text is long.
        if sum_len > 512:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
        elif sum_len > 64:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

        fig_box = plt2MD(plt)
        plt.clf()

        # Q('...') quoting lets patsy handle arbitrary column names.
        model = ols(
            """Q('{response_col}') ~ C(Q('{factor_col}'))""".format(
                response_col=response_col, factor_col=factor_col),
            table).fit()  # TODO factor_col = class => error
        anova = anova_lm(model)

        # Strip the patsy quoting artifacts from the ANOVA row labels and
        # surface them as a leading (unnamed) column.
        index_list = anova.index.tolist()
        remove_list = ["C(Q('", "'))", "Q('", "')"]
        for v in remove_list:
            index_list = [i.replace(v, "") for i in index_list]
        anova.insert(0, '', index_list)

        anova_df = pandasDF2MD(anova)

        # p-value of the factor effect (first row of the ANOVA table).
        p_value = anova["""PR(>F)"""][0]

        residual = model.resid

        sns.distplot(residual)
        distplot = plt2MD(plt)
        plt.clf()

        sm.qqplot(residual, line='s')
        qqplot = plt2MD(plt)
        plt.clf()

        rb.addMD(
            strip_margin("""
        | ## {response_col} by {factor_col}
        | {fig_box}
        |
        | ### ANOVA
        | {anova_df}
        | 
        | ### Diagnostics
        | {distplot}
        |
        | {qqplot}
        """.format(response_col=response_col,
                   factor_col=factor_col,
                   fig_box=fig_box,
                   anova_df=anova_df,
                   distplot=distplot,
                   qqplot=qqplot)))

        result['_grouped_data'][response_col]['p_value'] = p_value

    result['_repr_brtc_'] = rb.get()
    return {'result': result}
Ejemplo n.º 30
0
def _naive_bayes_train(table, feature_cols, label_col, alpha=1.0, fit_prior=True, class_prior=None):
    """Train a multinomial naive Bayes classifier and build a report.

    ``class_prior`` may be given as a list of ``"label:prior"`` strings;
    it is re-ordered to match the label encoder's class order before it is
    handed to ``MultinomialNB``.

    Returns ``{'model': model}`` containing the fitted model, the label
    encoder and a markdown report under ``'_repr_brtc_'``.
    """
    features = table[feature_cols]
    label = table[label_col]
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(label)
    label_correspond = label_encoder.transform(label)

    if class_prior is not None:
        # Map the "label:prior" strings onto the encoder's class ordering.
        tmp_class_prior = [0] * len(class_prior)
        for elems in class_prior:
            tmp = elems.split(":")
            tmp_class_prior[label_encoder.transform([tmp[0]])[0]] = float(tmp[1])
        class_prior = tmp_class_prior

    # Keyword arguments keep this robust against positional-signature
    # changes between scikit-learn releases.
    nb_model = MultinomialNB(alpha=alpha,
                             fit_prior=fit_prior,
                             class_prior=class_prior)
    nb_model.fit(features, label_correspond)
    class_log_prior = nb_model.class_log_prior_
    feature_log_prob_ = nb_model.feature_log_prob_
    # One row per class: [label, log prior pi, log P(feature|class) ...].
    tmp_result = np.hstack((list(map(list, zip(*[label_encoder.classes_] + [class_log_prior]))), (feature_log_prob_)))
    column_names = ['labels', 'pi']
    for feature_col in feature_cols:
        column_names += ['theta_' + feature_col]
    result_table = pd.DataFrame.from_records(tmp_result, columns=column_names)
    prediction_correspond = nb_model.predict(features)

    get_param = dict()
    get_param['Lambda'] = alpha
    # get_param['Prior Probabilities of the Classes'] = class_prior
    get_param['Fit Class Prior Probability'] = fit_prior
    get_param['Feature Columns'] = feature_cols
    get_param['Label Column'] = label_col

    # Training-set confusion matrix and accuracy for the report.
    cnf_matrix = confusion_matrix(label_correspond, prediction_correspond)

    plt.figure()
    _plot_confusion_matrix(cnf_matrix, classes=label_encoder.classes_,
                      title='Confusion Matrix')
    fig_confusion_matrix = plt2MD(plt)
    accuracy = nb_model.score(features, label_correspond) * 100

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Naive Bayes Classification Result
    |
    | ### Model:Multinomial
    | {result_table}
    | ### Parameters
    | {table_parameter} 
    | ### Predicted vs Actual
    | {image1}
    | #### Accuracy = {accuracy}%
    |
    """.format(image1=fig_confusion_matrix, accuracy=accuracy, result_table=pandasDF2MD(result_table), table_parameter=dict2MD(get_param))))

    model = _model_dict('naive_bayes_model')
    model['features'] = feature_cols
    model['label_col'] = label_col
    model['label_encoder'] = label_encoder
    model['nb_model'] = nb_model
    model['_repr_brtc_'] = rb.get()

    return {'model' : model}