Example #1
def score_divergence(codes, labels, sources, k=50, **kwargs):
    """
    Measures how well sources are mixed (smaller: well-mixed)

    Function to calculate the divergence score as described in BERMUDA

    Estimates the average pairwise symmetric divergence of p_src and q_tgt,
    i.e. 0.5 * D(p_src || q_tgt) + 0.5 * D(q_tgt || p_src) for each (src, tgt) pair.

    p and q are evaluated with a non-parametric density estimate centered at x_i,
    i.e. weighted by the distance to the k-th nearest neighbour of x_i in each dataset.

    inputs:
        codes: merged data matrix
        labels: labels of each item (e.g. cell type)
        sources: index of each item's source (e.g. tech; data or prior)
        k: number of nearest neighbours used to estimate the data density
        kwargs: see preprocess_code

    outputs:
        divergence score, non-negative
    """
    num_datasets = np.unique(sources).size
    div_pq = list()
    div_qp = list()

    # pairs of datasets
    for d1 in range(num_datasets):
        for d2 in range(d1+1, num_datasets):
            idx1, idx2, _ = separate_shared_idx(labels, sources, d1=d1, d2=d2)
            if sum(idx1) < k or sum(idx2) < k:
                continue

            pq = estimate(codes[idx1, :], codes[idx2, :], k)
            div_pq.append(max(pq, 0))

            qp = estimate(codes[idx2, :], codes[idx1, :], k)
            div_qp.append(max(qp, 0))

    # average the scores across pairs of datasets
    try:
        div_score = (sum(div_pq) / len(div_pq) + sum(div_qp) / len(div_qp)) / 2
    except ZeroDivisionError:
        div_score = np.nan
    return div_score
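Note: every example on this page calls an estimate() helper; its signature, estimate(X, Y, k=None, n_jobs=1), is quoted in a comment in Example #5. Assuming it implements the universal k-NN divergence estimator of Wang et al. (2009), a minimal sketch of one direction D(p || q) could look like the following (a hypothetical stand-in, not the packaged implementation):

import numpy as np
from sklearn.neighbors import NearestNeighbors

def knn_divergence(X, Y, k=50):
    """Estimate D(p || q) from samples X ~ p and Y ~ q (both 2-D arrays)."""
    n, d = X.shape
    m = Y.shape[0]
    # distance from each x_i to its k-th nearest neighbour within X (self excluded)
    rho = NearestNeighbors(n_neighbors=k + 1).fit(X).kneighbors(X)[0][:, -1]
    # distance from each x_i to its k-th nearest neighbour in Y
    nu = NearestNeighbors(n_neighbors=k).fit(Y).kneighbors(X)[0][:, -1]
    return d * np.mean(np.log(nu / rho)) + np.log(m / (n - 1.0))

score_divergence above symmetrises this by averaging the two directions over every pair of datasets, clipping each estimate at zero.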
Example #2
def plot_pair_grids(Xtrain, model):
    # (assumed wrapper: the opening of this example is not shown in the source;
    # the function name/signature and the loop structure of set_axes are assumptions)
    def set_axes(fig):
        # clamp every off-diagonal panel of the pair grid to the training-data range
        num_dims = Xtrain.shape[1]
        for i in range(num_dims):
            for j in range(num_dims):
                if i < j:
                    fig.axes[i, j].axis([
                        Xtrain[:, j].min() - 0.5, Xtrain[:, j].max() + 0.5,
                        Xtrain[:, i].min() - 0.5, Xtrain[:, i].max() + 0.5
                    ])
                if i > j:
                    fig.axes[i, j].axis([
                        Xtrain[:, j].min() - 0.5, Xtrain[:, j].max() + 0.5,
                        Xtrain[:, i].min() - 0.5, Xtrain[:, i].max() + 0.5
                    ])

    # plot
    plt.close('all')

    fig1 = sns.PairGrid(pd.DataFrame(Xtrain))
    fig1 = fig1.map_upper(plt.scatter, edgecolor="w")
    fig1 = fig1.map_lower(sns.kdeplot, cmap="Blues_d")
    fig1 = fig1.map_diag(sns.kdeplot, lw=3, legend=False)
    set_axes(fig1)
    savefig(1)

    x_mu = model.sample(1000)
    std = (x_mu[:, 4:8] - x_mu[:, 0:4])
    x_samp = x_mu[:, 4:8] + (std**2) * torch.randn_like(x_mu[:, 4:8])
    x_samp = x_samp[std.sum(dim=1) < 10]
    fig7 = sns.PairGrid(pd.DataFrame(x_samp.detach().numpy()))
    fig7 = fig7.map_upper(plt.scatter, edgecolor="w")
    fig7 = fig7.map_lower(sns.kdeplot, cmap="Blues_d")
    fig7 = fig7.map_diag(sns.kdeplot, lw=3, legend=False)
    set_axes(fig7)
    savefig(7)
    print(estimate(Xtrain, x_samp.detach().numpy()))
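This example also relies on a savefig helper that is not shown in the snippet; a minimal hypothetical stand-in (the filename scheme is an assumption) would be:

import matplotlib.pyplot as plt

def savefig(fig_num):
    # save the current figure under a numbered filename (naming scheme assumed)
    plt.savefig('figure_%d.png' % fig_num)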
Example #3
def evaluate_scores(div_ent_code, sil_code, cell_labels, dataset_labels,
                    num_datasets, div_ent_dim, sil_dim, sil_dist):
    """ Calculate three proposed evaluation metrics
    Args:
        div_ent_code: num_cells * num_features, embedding for divergence and entropy calculation, usually with dim of 2
        sil_code: num_cells * num_features, embedding for silhouette score calculation
        cell_labels: true cell labels
        dataset_labels: index of each cell's dataset
        num_datasets: number of datasets
        div_ent_dim: if dimension of div_ent_code > div_ent_dim, apply PCA first
        sil_dim: if dimension of sil_code > sil_dim, apply PCA first
        sil_dist: distance metric for silhouette score calculation
    Returns:
        div_score: divergence score
        ent_score: entropy score
        sil_score: silhouette score
    """
    # calculate divergence and entropy
    if div_ent_code.shape[1] > div_ent_dim:
        div_ent_code = PCA(
            n_components=div_ent_dim).fit_transform(div_ent_code)
    div_pq = []  # divergence dataset p, q
    div_qp = []  # divergence dataset q, p
    ent = []  # entropy
    # pairs of datasets
    for d1 in range(1, num_datasets + 1):
        for d2 in range(d1 + 1, num_datasets + 1):
            idx1 = dataset_labels == d1
            idx2 = dataset_labels == d2
            labels = np.intersect1d(np.unique(cell_labels[idx1]),
                                    np.unique(cell_labels[idx2]))
            idx1_mutual = np.logical_and(idx1, np.isin(cell_labels, labels))
            idx2_mutual = np.logical_and(idx2, np.isin(cell_labels, labels))
            idx_specific = np.logical_and(
                np.logical_or(idx1, idx2),
                np.logical_not(np.isin(cell_labels, labels)))
            # divergence
            if np.sum(idx1_mutual) >= cal_min and np.sum(
                    idx2_mutual) >= cal_min:
                div_pq.append(
                    max(
                        estimate(div_ent_code[idx1_mutual, :],
                                 div_ent_code[idx2_mutual, :], cal_min), 0))
                div_qp.append(
                    max(
                        estimate(div_ent_code[idx2_mutual, :],
                                 div_ent_code[idx1_mutual, :], cal_min), 0))
            # entropy
            if (sum(idx_specific) > 0):
                ent_tmp = cal_entropy(div_ent_code, idx_specific,
                                      dataset_labels)
                ent.append(sum(ent_tmp) / len(ent_tmp))
    if len(ent) == 0:  # if no dataset specific cell types, store entropy as -1
        ent.append(-1)

    # calculate silhouette_score
    if sil_code.shape[1] > sil_dim:
        sil_code = PCA(n_components=sil_dim).fit_transform(sil_code)
    sil_scores = silhouette_samples(sil_code, cell_labels, metric=sil_dist)

    # average for scores
    div_score = (sum(div_pq) / len(div_pq) + sum(div_qp) / len(div_qp)) / 2
    ent_score = sum(ent) / len(ent)
    sil_score = sum(sil_scores) / len(sil_scores)

    return div_score, ent_score, sil_score
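evaluate_scores uses two module-level names, cal_min and cal_entropy, that are defined elsewhere in the BERMUDA codebase. Plausible stand-ins, assuming cal_min is a minimum cell count and cal_entropy is a neighbourhood entropy of dataset labels (both the threshold value and the neighbour count below are assumptions):

import numpy as np
from sklearn.neighbors import NearestNeighbors

cal_min = 50  # assumed: minimum number of shared-type cells required in each dataset

def cal_entropy(code, idx, dataset_labels, k=100):
    """Shannon entropy of dataset labels among the k nearest neighbours of each selected cell."""
    nn = NearestNeighbors(n_neighbors=k).fit(code)
    nbrs = nn.kneighbors(code[idx, :], return_distance=False)
    entropies = []
    for row in nbrs:
        _, counts = np.unique(dataset_labels[row], return_counts=True)
        p = counts / counts.sum()
        entropies.append(float(-(p * np.log(p)).sum()))
    return entropies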
Example #4
def plot_posterior(samples,
                   x_truth,
                   epoch,
                   idx,
                   run='testing',
                   other_samples=None):
    """
    plots the posteriors
    """

    if other_samples is not None:
        true_post = np.zeros([other_samples.shape[0], bilby_ol_len])
        true_x = np.zeros(inf_ol_len)
        true_XS = np.zeros([samples.shape[0], inf_ol_len])
        ol_pars = []
        cnt = 0
        for inf_idx, bilby_idx in zip(inf_ol_idx, bilby_ol_idx):
            inf_par = params['inf_pars'][inf_idx]
            bilby_par = params['bilby_pars'][bilby_idx]
            true_XS[:, cnt] = (samples[:, inf_idx] *
                               (bounds[inf_par + '_max'] -
                                bounds[inf_par + '_min'])) + bounds[inf_par +
                                                                    '_min']
            true_post[:, cnt] = (
                other_samples[:, bilby_idx] *
                (bounds[bilby_par + '_max'] -
                 bounds[bilby_par + '_min'])) + bounds[bilby_par + '_min']
            true_x[cnt] = (x_truth[inf_idx] *
                           (bounds[inf_par + '_max'] - bounds[inf_par + '_min']
                            )) + bounds[inf_par + '_min']
            ol_pars.append(inf_par)
            cnt += 1
        parnames = []
        for k_idx, k in enumerate(params['rand_pars']):
            if np.isin(k, ol_pars):
                parnames.append(params['corner_labels'][k])

        # convert to RA
        true_XS = convert_hour_angle_to_ra(true_XS, params, ol_pars)
        true_x = convert_hour_angle_to_ra(
            np.reshape(true_x, [1, true_XS.shape[1]]), params,
            ol_pars).flatten()

        # compute KL estimate
        idx1 = np.random.randint(0, true_XS.shape[0], 1000)
        idx2 = np.random.randint(0, true_post.shape[0], 1000)
        try:
            KL_est = estimate(true_XS[idx1, :], true_post[idx2, :])
        except Exception:
            KL_est = -1.0

    else:
        # Get corner parnames to use in plotting labels
        parnames = []
        for k_idx, k in enumerate(params['rand_pars']):
            if np.isin(k, params['inf_pars']):
                parnames.append(params['corner_labels'][k])
        # un-normalise full inference parameters
        full_true_x = np.zeros(len(params['inf_pars']))
        new_samples = np.zeros([samples.shape[0], len(params['inf_pars'])])
        for inf_par_idx, inf_par in enumerate(params['inf_pars']):
            new_samples[:, inf_par_idx] = (
                samples[:, inf_par_idx] *
                (bounds[inf_par + '_max'] -
                 bounds[inf_par + '_min'])) + bounds[inf_par + '_min']
            full_true_x[inf_par_idx] = (
                x_truth[inf_par_idx] *
                (bounds[inf_par + '_max'] -
                 bounds[inf_par + '_min'])) + bounds[inf_par + '_min']
        new_samples = convert_hour_angle_to_ra(new_samples, params,
                                               params['inf_pars'])
        full_true_x = convert_hour_angle_to_ra(
            np.reshape(full_true_x, [1, samples.shape[1]]), params,
            params['inf_pars']).flatten()
        KL_est = -1.0

    # define general plotting arguments
    defaults_kwargs = dict(bins=50,
                           smooth=0.9,
                           label_kwargs=dict(fontsize=16),
                           title_kwargs=dict(fontsize=16),
                           truth_color='tab:orange',
                           quantiles=[0.16, 0.84],
                           levels=(0.68, 0.90, 0.95),
                           density=True,
                           plot_density=False,
                           plot_datapoints=True,
                           max_n_ticks=3)

    # 1-d hist kwargs for normalisation
    hist_kwargs = dict(density=True, color='tab:red')
    hist_kwargs_other = dict(density=True, color='tab:blue')

    if other_samples is None:
        figure = corner.corner(new_samples,
                               **defaults_kwargs,
                               labels=parnames,
                               color='tab:red',
                               fill_contours=True,
                               truths=x_truth,
                               show_titles=True,
                               hist_kwargs=hist_kwargs)
        plt.savefig('%s/full_posterior_epoch_%d_event_%d.png' %
                    (run, epoch, idx))
        plt.close()
    else:
        figure = corner.corner(true_post,
                               **defaults_kwargs,
                               labels=parnames,
                               color='tab:blue',
                               show_titles=True,
                               hist_kwargs=hist_kwargs_other)
        corner.corner(true_XS,
                      **defaults_kwargs,
                      color='tab:red',
                      fill_contours=True,
                      truths=true_x,
                      show_titles=True,
                      fig=figure,
                      hist_kwargs=hist_kwargs)
        plt.annotate('KL = {:.3f}'.format(KL_est), (0.2, 0.95),
                     xycoords='figure fraction',
                     fontsize=18)
        plt.savefig('%s/comp_posterior_epoch_%d_event_%d.png' %
                    (run, epoch, idx))
        plt.close()
    return KL_est
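The un-normalisation above repeats the same pattern for every parameter; a tiny helper (hypothetical, not part of the original code) states it once:

def unnormalise(x_unit, par, bounds):
    """Map a parameter scaled to [0, 1] back to its physical range."""
    return x_unit * (bounds[par + '_max'] - bounds[par + '_min']) + bounds[par + '_min']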
Example #5
def evaluate_scores(code_arr, cell_labels, dataset_labels, num_datasets,
                    epoch):
    """ Calculate three proposed evaluation metrics
    Args:
        div_ent_code: num_cells * num_features, embedding for divergence and entropy calculation, usually with dim of 2
        sil_code: num_cells * num_features, embedding for silhouette score calculation
        cell_labels: true cell labels
        dataset_labels: index of different datasets
        num_datasets: number of datasets
    Returns:
        div_score: divergence score
        ent_score: entropy score
        sil_score: silhouette score
    """
    # calculate UMAP
    import umap
    fit = umap.UMAP(n_neighbors=30,
                    min_dist=0.3,
                    n_components=2,
                    metric='cosine',
                    random_state=123)
    div_ent_code = fit.fit_transform(code_arr)
    # div_ent_code = PCA(n_components=2).fit_transform(code_arr)
    # print(div_ent_code.shape)

    # calculate divergence and entropy
    div_pq = []  # divergence dataset p, q
    div_qp = []  # divergence dataset q, p
    div_pq_all = []  # divergence dataset p, q
    div_qp_all = []  # divergence dataset q, p
    ent = []  # entropy
    # pairs of datasets
    for d1 in range(1, num_datasets + 1):
        for d2 in range(d1 + 1, num_datasets + 1):
            idx1 = dataset_labels == d1
            idx2 = dataset_labels == d2  # which batch each sample belongs to
            labels = np.intersect1d(
                np.unique(cell_labels[idx1]),
                np.unique(cell_labels[idx2]))  # cell types shared between the two datasets
            idx1_mutual = np.logical_and(idx1, np.isin(cell_labels, labels))
            idx2_mutual = np.logical_and(idx2, np.isin(cell_labels, labels))
            idx_specific = np.logical_and(
                np.logical_or(idx1, idx2),
                np.logical_not(np.isin(cell_labels, labels)))

            # Estimate the universal k-NN divergence.
            if np.sum(idx1_mutual) >= cal_min and np.sum(
                    idx2_mutual) >= cal_min:
                # calculate by cluster
                # batch_1 = div_ent_code[idx1, :]
                # batch_2 = div_ent_code[idx2, :]
                # for label_by in labels:
                #     # print(sum(label_by == cell_labels[idx1]), sum(label_by == cell_labels[idx2])) #cluster contain too little samples will lead to inf or nan
                #     #estimate(X, Y, k=None, n_jobs=1), X, Y: 2-dimensional array where each row is a sample.
                #     div_pq.append(
                #         estimate(batch_1[label_by == cell_labels[idx1], :], batch_2[label_by == cell_labels[idx2], :],
                #                  cal_min))
                #     div_qp.append(
                #         estimate(batch_2[label_by == cell_labels[idx2], :], batch_1[label_by == cell_labels[idx1], :],
                #                  cal_min))

                # calculate by all cells
                div_pq_all.append(
                    max(
                        estimate(div_ent_code[idx1_mutual, :],
                                 div_ent_code[idx2_mutual, :], cal_min), 0))
                div_qp_all.append(
                    max(
                        estimate(div_ent_code[idx2_mutual, :],
                                 div_ent_code[idx1_mutual, :], cal_min), 0))
            # entropy
            if (sum(idx_specific) > 0):
                ent_tmp = cal_entropy(div_ent_code, idx_specific,
                                      dataset_labels)
                ent.append(sum(ent_tmp) / len(ent_tmp))
    if len(ent) == 0:  # if no dataset specific cell types, store entropy as -1
        ent.append(-1)

    # # calculate silhouette_score
    # sil_code = code_arr
    # if sil_code.shape[1] > sil_dim:
    #     sil_code = PCA(n_components=2).fit_transform(sil_code)
    # sil_scores = silhouette_samples(sil_code, cell_labels, metric="euclidean")
    # print(div_ent_code.shape, sil_code.shape)

    sil_scores = silhouette_samples(div_ent_code,
                                    cell_labels,
                                    metric="euclidean")
    # sil_scores = silhouette_score(div_ent_code, cell_labels, metric="euclidean")

    # average for scores
    # div_pq = np.array(div_pq)[np.logical_and(np.isfinite(div_pq), ~np.isnan(div_pq))]
    # div_qp= np.array(div_qp)[np.logical_and(np.isfinite(div_qp), ~np.isnan(div_qp))]
    # div_score = (sum(div_pq) / len(div_pq) + sum(div_qp) / len(div_qp)) / 2
    div_score = 0
    div_score_all = (sum(div_pq_all) / len(div_pq_all) +
                     sum(div_qp_all) / len(div_qp_all)) / 2
    ent_score = sum(ent) / len(ent)
    sil_score = sum(sil_scores) / len(sil_scores)

    alignment_score = seurat_alignment_score(code_arr,
                                             dataset_labels,
                                             n=10,
                                             k=0.01)
    mixing_entropy = batch_mixing_entropy(code_arr, dataset_labels)

    print(
        "epoch: ", epoch,
        ' divergence_score: {:.3f}, {:.3f}, alignment_score, mixing_entropy: {:.3f},{:.3f} entropy_score: {:.3f}, silhouette_score: {:.3f}'
        .format(div_score, div_score_all, alignment_score, mixing_entropy,
                ent_score, sil_score))

    return div_score, div_score_all, ent_score, sil_score
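The seurat_alignment_score called above is not defined in this snippet. A sketch of a Seurat-style alignment score (Butler et al., 2018), under the assumption that k is a fraction of the subsampled cells and n is the number of subsampling rounds, might look like this; the exact normalisation in the original helper may differ:

import numpy as np
from sklearn.neighbors import NearestNeighbors

def seurat_alignment_score(code, dataset_labels, n=10, k=0.01, random_state=0):
    rng = np.random.RandomState(random_state)
    batches = np.unique(dataset_labels)
    min_size = min(int((dataset_labels == b).sum()) for b in batches)
    scores = []
    for _ in range(n):
        # downsample every batch to the size of the smallest one
        idx = np.concatenate([
            rng.choice(np.where(dataset_labels == b)[0], min_size, replace=False)
            for b in batches])
        sub_code, sub_labels = code[idx], dataset_labels[idx]
        n_neighbors = max(1, int(k * len(idx)))
        nbrs = NearestNeighbors(n_neighbors=n_neighbors + 1).fit(sub_code) \
            .kneighbors(sub_code, return_distance=False)[:, 1:]  # drop self
        same = (sub_labels[nbrs] == sub_labels[:, None]).sum(axis=1).mean()
        expected = n_neighbors / len(batches)  # same-batch neighbours under perfect mixing
        scores.append(1.0 - (same - expected) / (n_neighbors - expected))
    return float(np.mean(scores))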
Example #6
        def compute_kl(sampset_1,sampset_2,samplers,one_D=False):
            """
            Compute KL for one test case.
            """
            
            # Remove samples outside of the prior mass distribution           
            cur_max = self.params['n_samples']
            
            # Iterate over parameters and remove samples outside of prior
            if samplers[0] == 'vitamin1' or samplers[1] == 'vitamin2':

                # Apply mask
                sampset_1 = sampset_1.T
                sampset_2 = sampset_2.T
                set1 = sampset_1
                set2 = sampset_2
                del_cnt_set1 = 0
                del_cnt_set2 = 0
                params_to_infer = self.params['inf_pars']
                for i in range(set1.shape[1]):

                    # iterate over each parameter in first set
                    for k,q in enumerate(params_to_infer):
                        # if sample out of range, delete the sample
                        if set1[k,i] < 0.0 or set1[k,i] > 1.0:
                            sampset_1 = np.delete(sampset_1,del_cnt_set1,axis=1)
                            del_cnt_set1-=1
                            break
                        # check m1 > m2
                        elif q == 'mass_1' or q == 'mass_2':
                            m1_idx = np.argwhere(params_to_infer=='mass_1')
                            m2_idx = np.argwhere(params_to_infer=='mass_2')
                            if set1[m1_idx,i] < set1[m2_idx,i]:
                                sampset_1 = np.delete(sampset_1,del_cnt_set1,axis=1)
                                del_cnt_set1-=1
                                break

                    del_cnt_set1+=1

                # iterate over each sample
                for i in range(set2.shape[1]):

                    # iterate over each parameter in second set
                    for k,q in enumerate(params_to_infer):
                        # if sample out of range, delete the sample
                        if set2[k,i] < 0.0 or set2[k,i] > 1.0:
                            sampset_2 = np.delete(sampset_2,del_cnt_set2,axis=1)
                            del_cnt_set2-=1
                            break
                        # check m1 > m2
                        elif q == 'mass_1' or q == 'mass_2':
                            m1_idx = np.argwhere(params_to_infer=='mass_1')
                            m2_idx = np.argwhere(params_to_infer=='mass_2')
                            if set2[m1_idx,i] < set2[m2_idx,i]:
                                sampset_2 = np.delete(sampset_2,del_cnt_set2,axis=1)
                                del_cnt_set2-=1
                                break

                    del_cnt_set2+=1

                del_final_idx = np.min([del_cnt_set1,del_cnt_set2])
                set1 = sampset_1[:,:del_final_idx]
                set2 = sampset_2[:,:del_final_idx]

            else:

                set1 = sampset_1.T
                set2 = sampset_2.T
      
            
            # Iterate over number of randomized sample slices
            SMALL_CONSTANT = 1e-162 # 1e-4 works best for some reason
            def my_kde_bandwidth(obj, fac=1.0):

                """We use Scott's Rule, multiplied by a constant factor."""

                return np.power(obj.n, -1./(obj.d+4)) * fac
            if one_D:
                kl_result_all = np.zeros((1,len(self.params['inf_pars'])))
                for r in range(len(self.params['inf_pars'])):
                    if self.params['gen_indi_KLs'] == True:
                        p = gaussian_kde(set1[r],bw_method=my_kde_bandwidth)#'scott') # 7.5e0 works best ... don't know why. Hope it's not over-smoothing results.
                        q = gaussian_kde(set2[r],bw_method=my_kde_bandwidth)#'scott')#'silverman') # 7.5e0 works best ... don't know why.   
                        # Compute KL Divergence
                        log_diff = np.log((p(set1[r])+SMALL_CONSTANT)/(q(set1[r])+SMALL_CONSTANT))
                        kl_result = (1.0/float(set1.shape[1])) * np.sum(log_diff)

                        # compute symmetric KL
                        anti_log_diff = np.log((q(set2[r])+SMALL_CONSTANT)/(p(set2[r])+SMALL_CONSTANT))
                        anti_kl_result = (1.0/float(set2.shape[1])) * np.sum(anti_log_diff)
                        kl_result_all[:,r] = kl_result + anti_kl_result
                    else:
                        kl_result_all[:,r] = 0   

                return kl_result_all
            else:
                kl_result = []
                set1 = set1.T
                set2 = set2.T
                for kl_idx in range(10):
                    rand_idx_kl = np.random.choice(np.linspace(0, set1.shape[0] - 1, dtype=int), size=100)
                    kl_result.append(estimate(set1[rand_idx_kl,:],set2[rand_idx_kl,:]) + estimate(set2[rand_idx_kl,:],set1[rand_idx_kl,:]))
                kl_result = np.mean(kl_result)

                return kl_result
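For reference, the per-parameter branch (one_D=True) is a Monte Carlo estimate of the symmetrised KL between two Gaussian KDE fits. A compact standalone restatement (a sketch; eps plays the role of SMALL_CONSTANT above):

import numpy as np
from scipy.stats import gaussian_kde

def symmetric_kde_kl(x, y, eps=1e-162):
    """Estimate KL(p||q) + KL(q||p) from 1-D samples x ~ p and y ~ q."""
    p, q = gaussian_kde(x), gaussian_kde(y)
    kl_pq = np.mean(np.log((p(x) + eps) / (q(x) + eps)))  # E_p[log p/q]
    kl_qp = np.mean(np.log((q(y) + eps) / (p(y) + eps)))  # E_q[log q/p]
    return kl_pq + kl_qp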