import numpy as np
import pandas as pd
import networkx as nx
import plotly.graph_objs as go

# load_dataset, preprocessing_data_with_unit_var, weighted_prediction_round,
# and weighted_prediction_quantile are project-local helpers assumed to be
# importable from elsewhere in this package.


def confidence_calibration_check(round_idx, respondent_idx, d_metric_dict,
                                 n_neighbors):
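    """
    For each defendant, compare the confidence-weighted prediction against the
    recidivism proportion among its n_neighbors nearest neighbors under the
    learned distance metric.

    :param round_idx: integer, ranging from 1 to 10
    :param respondent_idx: id of respondents, ranging from 1 to 35
    :param d_metric_dict: dict of learned distance metrics, indexed by the
    respondent_idx and the round_idx
    :param n_neighbors: the number of nearest neighbors used for calibration
    :return: a dataframe holding, per defendant, the weighted prediction, the
    true label, and the neighborhood recidivism proportion
    """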
    _, X_test, _, y_test, _ = load_dataset(onehot=True)
    X_test = preprocessing_data_with_unit_var(X_test)
    d_metric = d_metric_dict['R%d_W%d' % (respondent_idx, round_idx)]

    weighted_y_test = pd.DataFrame(
        weighted_prediction_round(round_idx)[respondent_idx - 1].values,
        index=y_test.index,
        columns=['weighted_y_test'])
    weighted_y_test['y_test'] = y_test
    # neighborhood recidivism proportion, filled in per defendant below
    weighted_y_test['neighbor reci prop'] = np.nan

    for idx in X_test.index:
        # squared distance from idx to every defendant under the learned
        # metric: diff @ d_metric @ diff.T carries the per-row quadratic forms
        # (x_i - x)^T M (x_i - x) on its diagonal
        diff = X_test.values - X_test.loc[idx].values
        X_test['distance'] = np.dot(diff @ d_metric, diff.T).diagonal()

        # distance to the n_neighbors-th nearest point; position 0 is idx
        # itself at distance 0
        max_d = X_test['distance'].sort_values(
            ascending=True).values[n_neighbors]

        neighbor_list = [
            my_idx
            for my_idx in X_test.loc[X_test['distance'] <= max_d].index.values
            if my_idx != idx
        ]

        # recidivism proportion among the selected neighbors, recorded for
        # idx itself (the original code mistakenly wrote to `neighbor`)
        reci = 0
        for neighbor in neighbor_list:
            reci += weighted_y_test.loc[neighbor, 'y_test']
        weighted_y_test.loc[idx,
                            'neighbor reci prop'] = reci / n_neighbors

        X_test.drop(['distance'], axis=1, inplace=True)

    return weighted_y_test
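

# A vectorized alternative (a sketch, not part of the original pipeline) to
# the diagonal trick used in the loops above: einsum computes only the
# per-row quadratic forms, avoiding the full n x n matrix product.
def _metric_distances(X, x_row, d_metric):
    """Return d_i = (x_i - x_row)^T M (x_i - x_row) for every row x_i of X.

    Equivalent to np.dot(diff @ d_metric, diff.T).diagonal(), but O(n * d^2)
    time and O(n * d) memory instead of O(n^2 * d).
    """
    diff = X - x_row
    return np.einsum('ij,jk,ik->i', diff, d_metric, diff)
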
def similarity_fig_from_metric_by_nca_part(round_idx, respondent_idx,
                                           n_neighbors, d_metric_dict):
    """
    Similar to similarity_fig_from_metric_by_nca, except that here the
    neighborhood of x_i is the minimal hypersphere containing only its
    n_neighbors most similar points, rather than all same-class points.

    :param round_idx: integer, ranging from 1 to 10
    :param respondent_idx: id of respondents, ranging from 1 to 35
    :param n_neighbors: the number of similar points that define the neighborhood
    :param d_metric_dict: dict of learned distance metrics, indexed by the
    respondent_idx and the round_idx
    :return: a similarity connection graph and the similarity edge list
    """
    _, X_test, _, y_test, _ = load_dataset(onehot=True)
    X_test = preprocessing_data_with_unit_var(X_test)
    d_metric = d_metric_dict['R%d_W%d' % (respondent_idx, round_idx)]
    weighted_y_test = pd.DataFrame(weighted_prediction_quantile(
        weighted_prediction_round(round_idx))[respondent_idx - 1].values,
                                   index=y_test.index)

    similarity_edges = []

    for idx in X_test.index:
        # squared distance from idx to every defendant under the learned metric
        diff = X_test.values - X_test.loc[idx].values
        X_test['distance'] = np.dot(diff @ d_metric, diff.T).diagonal()

        # neighborhood radius: distance to the n_neighbors-th closest point
        # that shares idx's predicted class (idx itself sits at position 0)
        same_class = X_test.loc[(
            weighted_y_test == weighted_y_test.loc[idx]).values.ravel()]
        max_d = same_class['distance'].sort_values(ascending=True).values[min(
            len(same_class) - 1, n_neighbors)]

        neighbor_list = [
            my_idx
            for my_idx in X_test.loc[X_test['distance'] <= max_d].index.values
            if my_idx != idx
        ]
        for item in neighbor_list:
            similarity_edges.append((idx, item))
        X_test.drop(['distance'], axis=1, inplace=True)

    fig, edges = similarity_fig_from_weighted_prediction(
        round_idx=round_idx,
        respondent_idx=respondent_idx,
        similarity_edge_list=similarity_edges)

    return fig, edges
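

# Hypothetical usage (a sketch; assumes d_metric_dict maps 'R%d_W%d' keys to
# learned metrics, as in the functions above):
# fig, edges = similarity_fig_from_metric_by_nca_part(
#     round_idx=1, respondent_idx=3, n_neighbors=5,
#     d_metric_dict=d_metric_dict)
# fig.show()  # requires plotly >= 4
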
def similarity_fig_from_metric_by_nca(round_idx, respondent_idx,
                                      d_metric_dict):
    """
    nca -> neighborhood component analysis

    First, define the neighborhood of data point x_i in the transformed space:
    the minimal hypersphere centered at x_i whose radius is the distance to
    the farthest point predicted to be in the same class as x_i. This function
    builds similarity edges between x_i and all data points within that
    neighborhood.

    :param round_idx: integer, ranging from 1 to 10
    :param respondent_idx: id of respondents, ranging from 1 to 35
    :param d_metric_dict: dict of learned distance metrics, indexed by the
    respondent_idx and the round_idx
    :return: a similarity connection graph and the similarity edge list
    """
    _, X_test, _, y_test, _ = load_dataset(onehot=True)
    X_test = preprocessing_data_with_unit_var(X_test)
    d_metric = d_metric_dict['R%d_W%d' % (respondent_idx, round_idx)]

    weighted_y_test = pd.DataFrame(weighted_prediction_quantile(
        weighted_prediction_round(round_idx))[respondent_idx - 1].values,
                                   index=y_test.index)

    similarity_edges = []

    for idx in X_test.index:
        # squared distance from idx to every defendant under the learned metric
        diff = X_test.values - X_test.loc[idx].values
        X_test['distance'] = np.dot(diff @ d_metric, diff.T).diagonal()
        # neighborhood radius: distance to the farthest point sharing idx's
        # predicted class
        max_d = X_test.loc[(weighted_y_test == weighted_y_test.loc[idx]
                            ).values.ravel()]['distance'].max()

        neighbor_list = [
            my_idx
            for my_idx in X_test.loc[X_test['distance'] <= max_d].index.values
            if my_idx != idx
        ]
        for item in neighbor_list:
            similarity_edges.append((idx, item))
        X_test.drop(['distance'], axis=1, inplace=True)
    fig, edges = similarity_fig_from_weighted_prediction(
        round_idx=round_idx,
        respondent_idx=respondent_idx,
        similarity_edge_list=similarity_edges)

    return fig, edges
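

# The two neighborhood definitions are nested: for a fixed metric, the radius
# used by similarity_fig_from_metric_by_nca (farthest same-class point) is at
# least the radius used by the *_part variant (n_neighbors-th closest point),
# so the former's edge list is a superset of the latter's. A hypothetical
# comparison, assuming the same d_metric_dict as above:
# _, edges_full = similarity_fig_from_metric_by_nca(1, 3, d_metric_dict)
# _, edges_knn = similarity_fig_from_metric_by_nca_part(1, 3, 5, d_metric_dict)
# assert set(edges_knn) <= set(edges_full)
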
def connectivity_sanity_check(similarity_edges,
                              round_idx,
                              respondent_idx,
                              div=2):
    """
    Summarize each defendant's connectivity: how many similarity edges it
    participates in, and the fraction of its neighbors that share its
    predicted class.

    :param similarity_edges: edges between similar points, either a list of
    tuples (x_i, x_j) indicating that x_i and x_j are similar, or a dict
    mapping each index to the list of its neighbors
    :param round_idx: integer, ranging from 1 to 10
    :param respondent_idx: id of respondents, ranging from 1 to 35
    :param div: quantile division parameter forwarded to
    weighted_prediction_quantile
    :return: a dataframe summarizing the connectivity of the provided edges
    """
    _, _, _, y_test, _ = load_dataset(onehot=False)
    sum_df = pd.DataFrame(np.zeros_like(y_test.values),
                          index=y_test.index,
                          columns=['Amount'])
    sum_df['Accuracy'] = sum_df['Amount']

    weighted_y_test = pd.DataFrame(
        weighted_prediction_quantile(weighted_prediction_round(round_idx),
                                     div=div)[respondent_idx - 1].values,
        index=y_test.index)

    if isinstance(similarity_edges, dict):
        my_dict = similarity_edges
    else:
        # build an adjacency dict from the (x_i, x_j) edge tuples
        my_dict = {idx: [] for idx in y_test.index}
        for x_i, x_j in similarity_edges:
            my_dict[x_i].append(x_j)
            my_dict[x_j].append(x_i)

    for idx, neighbors in my_dict.items():
        sum_df.loc[idx, "Amount"] = len(neighbors)
        if neighbors:
            # fraction of neighbors whose predicted class matches idx's
            same_class = set(weighted_y_test.loc[
                weighted_y_test[0] == weighted_y_test.loc[idx, 0]].index)
            sum_df.loc[idx, "Accuracy"] = \
                len(set(neighbors) & same_class) / len(neighbors)
        else:
            # defendant idx has no similar points at all
            sum_df.loc[idx, "Accuracy"] = 0
    return sum_df
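

# Hypothetical usage (assumes a populated d_metric_dict keyed as 'R%d_W%d'):
# _, edges = similarity_fig_from_metric_by_nca(1, 3, d_metric_dict)
# summary = connectivity_sanity_check(edges, round_idx=1, respondent_idx=3)
# print(summary.describe())
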
def similarity_fig_from_weighted_prediction(round_idx,
                                            respondent_idx,
                                            similarity_edge_list=None,
                                            weighted_pred=None):
    """
    Draw the similarity graph. If similarity_edge_list is given, it takes
    priority over the weighted prediction for constructing the edges.

    :param round_idx: integer, ranging from 1 to 10
    :param respondent_idx: id of respondents, ranging from 1 to 35
    :param similarity_edge_list: edges between similar points, a list of
    tuples (x_i, x_j) indicating that x_i and x_j are similar
    :param weighted_pred: confidence-weighted predictions, either at a
    specific round_idx or for a specific respondent at all times
    :return: a similarity connection graph of the 50 defendants, and the
    similarity_edge_list
    """
    _, X_test, _, y_test, _ = load_dataset(onehot=True)

    # (x, y) position of each node; the nodes are placed evenly on a circle
    # for readability
    pos = dict()
    for i in range(50):
        pos[X_test.index.values[i]] = (np.cos(2 * np.pi * i / 50),
                                       np.sin(2 * np.pi * i / 50))
    # add all the nodes to the network
    G = nx.Graph()
    G.add_nodes_from(pos.keys())
    for n, p in pos.items():
        G.nodes[n]['pos'] = p

    # similarity_edge_list takes priority when constructing the network
    if similarity_edge_list is None:
        if weighted_pred is None:
            # fall back to the 14-class weighted prediction if both are None
            y_test = weighted_prediction_round(
                round_idx=round_idx)[respondent_idx - 1].values.reshape(-1, 1)
        else:
            y_test = weighted_pred[respondent_idx - 1].values.reshape(-1, 1)
        # connect every pair of defendants that share a predicted label
        mask = (y_test[None] == y_test[:, None])[:, :, 0]
        a, b = np.nonzero(np.triu(mask, k=1))
        similarity_edge_list = [(X_test.index.values[a_],
                                 X_test.index.values[b_])
                                for a_, b_ in zip(a, b)]

    G.add_edges_from(similarity_edge_list)
    # make the graph via Plotly
    edge_trace = go.Scatter(x=[],
                            y=[],
                            line=dict(width=0.5, color='#888'),
                            hoverinfo='none',
                            mode='lines')

    for edge in G.edges():
        x0, y0 = G.nodes[edge[0]]['pos']
        x1, y1 = G.nodes[edge[1]]['pos']
        edge_trace['x'] += tuple([x0, x1, None])
        edge_trace['y'] += tuple([y0, y1, None])

    node_trace = go.Scatter(
        x=[],
        y=[],
        text=[],
        mode='markers',
        hoverinfo='text',
        marker=dict(
            showscale=True,
            # color-scale options
            # 'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
            # 'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
            # 'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
            colorscale='YlGnBu',
            reversescale=True,
            color=[],
            size=10,
            colorbar=dict(thickness=15,
                          title='Node Connections',
                          xanchor='left',
                          titleside='right'),
            line=dict(width=2)))

    for node in G.nodes():
        x, y = G.nodes[node]['pos']
        node_trace['x'] += tuple([x])
        node_trace['y'] += tuple([y])

    # nodes were added in X_test.index order, so the positional index from
    # enumerate maps back to the defendant id
    for node, adjacencies in enumerate(G.adjacency()):
        node_trace['marker']['color'] += tuple([len(adjacencies[1])])
        node_info = '# of connections: ' + str(len(adjacencies[1])) + '<br>Defendant_ID: '\
                    + str(X_test.index.values[node])
        node_trace['text'] += tuple([node_info])

    fig = go.Figure(
        data=[edge_trace, node_trace],
        layout=go.Layout(
            width=700,
            height=600,
            title=
            '<br>Similarity Graph of COMPAS Defendants (Respondent-%2d_Week-%2d)'
            % (respondent_idx, round_idx),
            titlefont=dict(size=16),
            showlegend=False,
            hovermode='closest',
            margin=dict(b=20, l=50, r=50, t=40),
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))

    return fig, list(G.edges())
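

# A minimal end-to-end sketch (assumes the project-local helpers above are
# importable; not part of the original module): draw the fallback label-based
# graph for one respondent/round and sanity-check its edge structure.
if __name__ == '__main__':
    demo_fig, demo_edges = similarity_fig_from_weighted_prediction(
        round_idx=1, respondent_idx=1)
    print(connectivity_sanity_check(demo_edges, round_idx=1,
                                    respondent_idx=1).head())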