Beispiel #1
0
def gethistograms(novice_array, intermediary_array, expert_array):
    """Pairwise Jensen-Shannon distances between KDE-smoothed distributions
    of novice, intermediary and expert data.

    Each input is an array of samples; a Gaussian KDE is fit to each one
    (on the transposed array) and evaluated on a common grid obtained from
    ``findrange``.

    Returns
    -------
    tuple of floats
        (novice-vs-expert, intermediary-vs-expert, novice-vs-intermediary)
        Jensen-Shannon distances.
    """
    # Common evaluation grid.  NOTE(review): the grid is derived from the
    # expert and intermediary data only; novice values outside that range
    # are not covered by the grid -- confirm this is intended.
    common_range, grid = findrange(expert_array, intermediary_array)

    # Dropped unused locals from the original (manip, span, kldiff, jsdiff)
    # -- none of them influenced the result.
    intermediary_values = ss.gaussian_kde(intermediary_array.T).evaluate(grid)
    expert_values = ss.gaussian_kde(expert_array.T).evaluate(grid)
    novice_values = ss.gaussian_kde(novice_array.T).evaluate(grid)

    js_diff_ne = distance.jensenshannon(novice_values, expert_values)
    js_diff_ie = distance.jensenshannon(intermediary_values, expert_values)
    js_diff_ni = distance.jensenshannon(novice_values, intermediary_values)
    return js_diff_ne, js_diff_ie, js_diff_ni
Beispiel #2
0
def check_constraints(sample, probability_matrix):
    """Compute deviation of s_in/s_out from expected values.

    Empirical in/out strengths of ``sample`` are compared against the
    expected strengths implied by ``probability_matrix`` (axis-1 sums for
    in-strength, axis-0 sums for out-strength).

    Returns a dict with MAE, MAPE, squared Jensen-Shannon divergence
    (base 2) and Pearson/Spearman correlations (with p-values) for both
    directions.
    """
    # NOTE: in the original code this docstring sat *after* the first
    # statement, making it a no-op string expression instead of a docstring.
    s_in, s_out = empirical_strengths(sample.full.edges,
                                      sample.full.nodes,
                                      marginalized=True)
    # Expected strengths from the probability matrix.
    _s_in = probability_matrix.sum(axis=1)
    _s_out = probability_matrix.sum(axis=0)
    corr_in, pv_in = pearsonr(s_in, _s_in)
    corr_out, pv_out = pearsonr(s_out, _s_out)
    spcorr_in, sp_pv_in = spearmanr(s_in, _s_in)
    spcorr_out, sp_pv_out = spearmanr(s_out, _s_out)
    return dict(
        s_in_mae=mean_absolute_error(s_in, _s_in),
        s_out_mae=mean_absolute_error(s_out, _s_out),
        s_in_mape=mean_absolute_percentage_error(s_in, _s_in),
        s_out_mape=mean_absolute_percentage_error(s_out, _s_out),
        # r2_in=r2_score(s_in, _s_in),
        # r2_out=r2_score(s_out, _s_out),
        s_in_js=jensenshannon(s_in, _s_in, base=2)**2,
        s_out_js=jensenshannon(s_out, _s_out, base=2)**2,
        corr_in=corr_in,
        corr_out=corr_out,
        spcorr_in=spcorr_in,
        spcorr_out=spcorr_out,
        pv_in=pv_in,
        pv_out=pv_out,
        sp_pv_in=sp_pv_in,
        sp_pv_out=sp_pv_out,
    )
Beispiel #3
0
def getJS_mn(seqs, m, n, I, J, P_j, k, p, alphabet, inv=False):
    """Jensen-Shannon distance (in bits) between the k-mer distributions at
    positions m and n.

    Parameters:
      seqs     -- list of lists of all input sequences (unused here)
      m, n     -- the two position indices to compare
      I, J     -- number of sequences / sequence length (unused here)
      P_j      -- per-position k-mer frequency dicts
      k, p     -- k-mer length and pseudocount mass (unused here)
      alphabet -- alphabet used (unused here)
      inv      -- if True, return 1 - distance (high values mean similarity)

    Returns [(m, n), score, contributions, None]; `contributions` is kept
    for interface compatibility and is never populated.
    """
    contributions = {}
    # Align both distributions on the union of observed k-mers; a k-mer
    # missing from one position contributes probability 0 there.
    all_kmers = set(P_j[m]) | set(P_j[n])
    dist_m = [P_j[m].get(kmer, 0.0) for kmer in all_kmers]
    dist_n = [P_j[n].get(kmer, 0.0) for kmer in all_kmers]
    # Base-2 logarithm so the distance is measured in bits.
    score = jensenshannon(dist_m, dist_n, 2)
    if inv:
        # Inverse of the JS distance: high values now mean similarity.
        score = 1 - score
    return [(m, n), score, contributions, None]
Beispiel #4
0
def JSdistance(p, q, base=None):
    """Calculate the Jensen-Shannon distance between two distributions.

    Input:
    p: a list of floats or numpy array specifying a pdf or pmf
    q: a list of floats or numpy array specifying a pdf or pmf
    base: optional logarithm base (scipy's default, the natural log,
          is used when None)
    """
    # scipy itself treats base=None as "use the natural logarithm", so the
    # argument can be forwarded unconditionally -- no need to branch on it.
    return distance.jensenshannon(p, q, base)
Beispiel #5
0
def js_metric(df1, df2, numerical_columns, categorical_columns):
    """Per-column Jensen-Shannon distance between two dataframes.

    Categorical columns are compared via their normalized category counts;
    numerical columns via Gaussian KDEs evaluated on a shared 100-point
    grid over the combined value range.  Returns a dict mapping column
    name -> distance, ordered from largest to smallest distance.
    """
    n_grid_points = 100  # resolution of the KDE evaluation grid
    distances = {}

    for col in categorical_columns:
        # Category counts per frame, aligned on the union of categories
        # (categories absent from one side contribute a zero count).
        base_counts, samp_counts = df1[col].value_counts().align(
            df2[col].value_counts(), fill_value=0)
        p = (base_counts / base_counts.sum()).to_numpy()
        q = (samp_counts / samp_counts.sum()).to_numpy()
        distances[col] = jensenshannon(p, q)

    for col in numerical_columns:
        baseline, sample = df1[col], df2[col]

        # Shared evaluation grid covering both frames' value range.
        grid = np.linspace(start=min(baseline.min(), sample.min()),
                           stop=max(baseline.max(), sample.max()),
                           num=n_grid_points)

        # Fit one Gaussian KDE per frame and evaluate it on the grid.
        dens_baseline = gaussian_kde(baseline)(grid)
        dens_sample = gaussian_kde(sample)(grid)

        # Normalize to probability vectors before comparing.
        distances[col] = jensenshannon(dens_baseline / np.sum(dens_baseline),
                                       dens_sample / np.sum(dens_sample))

    # Largest drift first.
    return dict(sorted(distances.items(), key=lambda kv: kv[1], reverse=True))
Beispiel #6
0
    def score(self, X, y=None, j_ways=1):
        """Return jensen-shannon distance

           Parameters
           ----------
            X : synth array, shape(n_samples, n_features)
                The data.
            y : None
                Ignored variable.

           Returns
           -------
        """
        # marginal_domain = get_marginal_domain(y, j_ways)
        # marginal_counts_X = count_marginals(X, marginal_domain)
        # marginal_counts_y = count_marginals(y, marginal_domain)
        #
        # synth_hist, synth_bins = sts.contingency_table(X, epsilon=np.inf(float))
        self.feature_distances = {}
        average_feature_distance = np.empty_like(X.columns)
        for i, c in enumerate(X.columns):
            counts_X, counts_y = X[c].value_counts(dropna=False).align(y[c].value_counts(dropna=False), join='outer',
                                                                       axis=0, fill_value=0)
            js_distance = jensenshannon(counts_X, counts_y)
            average_feature_distance[i] = js_distance
            self.feature_distances[c] = js_distance
        average_feature_distance = np.sum(average_feature_distance) / len(X.columns)
        return average_feature_distance
Beispiel #7
0
def degree_js(dataset, model, agg):
    """Jensen-Shannon distance between generation 0's degree distribution
    and every later generation's, per trial.

    Parameters
    ----------
    dataset, model : labels copied into every output row.
    agg : dict mapping trial -> {generation -> {key: count}}.

    Returns
    -------
    dict of column-name -> list, one entry per (trial, gen != 0) pair.
    """
    rows = {'dataset': [], 'model': [], 'trial': [], 'gen': [], 'degree_js': []}

    for trial in agg.keys():
        dist1 = agg[trial][0]

        for gen in agg[trial].keys():
            if gen != 0:
                dist2 = agg[trial][gen]
                # Align both distributions on the union of observed keys so
                # the value vectors below compare like with like.
                # BUGFIX: the original filled dist2 from dist1 (copy/paste
                # typo: dist2[key] = dist1.get(key, 0)) and then relied on
                # dict insertion order, which differed between dist1 and
                # dist2, so the vectors were not key-aligned.
                union = sorted(set(dist1.keys()) | set(dist2.keys()))

                # Small epsilon keeps jensenshannon away from all-zero bins.
                deg1 = np.asarray([dist1.get(key, 0) for key in union]) + 0.00001
                deg2 = np.asarray([dist2.get(key, 0) for key in union]) + 0.00001

                deg_js = distance.jensenshannon(deg1, deg2, base=2.0)

                rows['dataset'].append(dataset)
                rows['model'].append(model)
                rows['trial'].append(trial)
                rows['gen'].append(gen)
                rows['degree_js'].append(deg_js)
    return rows
def jensen_shannon(query, matrix):
    """Jensen-Shannon distance between one topic distribution and each
    document's topic distribution (documents are the rows of `matrix`)."""
    # Reshape the query to a column vector so it broadcasts against every
    # document column after the transpose below.
    query_col = query[:, None]  # (n_topics,) -> (n_topics, 1)
    print(query_col.shape)
    docs_by_topic = matrix.T    # (n_topics, n_documents)
    print(docs_by_topic.shape)
    # scipy reduces along axis 0, yielding one distance per document.
    return jensenshannon(query_col, docs_by_topic)
Beispiel #9
0
def calc_jsd(args, X, G, dist):
    """Mean and std (over 10 rounds) of per-feature Jensen-Shannon distances
    between histograms of generated samples and real samples from X."""
    print("evaluating JSD")
    G.eval()

    # One binning per feature; the third feature uses finer bins.
    bins = [np.arange(-1, 1, 0.02), np.arange(-1, 1, 0.02), np.arange(-1, 1, 0.01)]
    num_real = len(X)

    all_jsds = []
    for _ in tqdm(range(10)):
        # Draw enough batches to cover num_samples, then trim the excess.
        batches = [
            utils.gen(args, G, dist=dist, num_samples=args.batch_size).cpu().detach().numpy()
            for _ in range(1 + int(args.num_samples / args.batch_size))
        ]
        gen_out = np.concatenate(batches, 0)[:args.num_samples]

        real = X[rng.choice(num_real, size=args.num_samples, replace=False)].cpu().detach().numpy()

        round_jsds = []
        for feat in range(3):
            gen_hist = np.histogram(gen_out[:, :, feat].reshape(-1), bins=bins[feat], density=True)[0]
            real_hist = np.histogram(real[:, :, feat].reshape(-1), bins=bins[feat], density=True)[0]
            round_jsds.append(jensenshannon(gen_hist, real_hist))

        all_jsds.append(round_jsds)

    all_jsds = np.array(all_jsds)
    return np.mean(all_jsds, axis=0), np.std(all_jsds, axis=0)
Beispiel #10
0
def calculate_all_results(demos, approach):
    """Compare an approach's final-salad outcome distribution to the demos'.

    Prints three values: the approach's probability of the 'invalid'
    outcome, the Jensen-Shannon distance between the two outcome
    distributions, and the difference in average lengths.  Returns None.
    """
    final_salads_demo, avg_length_demo = calculate_results_for_demos(demos)
    final_salads, avg_length = calculate_results_for_approach(
        demos, approach, final_salads_demo)

    # Ensure every outcome the approach produced also exists in the demo
    # counts (with count 0).  NOTE(review): the reverse is not ensured -- a
    # key present only in final_salads_demo would raise KeyError below;
    # presumably calculate_results_for_approach returns a key superset.
    for k in final_salads.keys():
        if k not in final_salads_demo.keys():
            final_salads_demo[k] = 0

    # Build aligned count vectors, placing 'invalid' last in both.
    demo_dist = []
    approach_dist = []
    demo_count = 0
    approach_count = 0
    for k in final_salads_demo.keys():
        approach_count += final_salads[k]
        demo_count += final_salads_demo[k]
        if k != 'invalid':
            approach_dist.append(final_salads[k])
            demo_dist.append(final_salads_demo[k])
    approach_dist.append(final_salads['invalid'])
    demo_dist.append(final_salads_demo['invalid'])

    # Normalize counts into probability distributions.
    for i in range(len(approach_dist)):
        approach_dist[i] /= float(approach_count)

    for i in range(len(demo_dist)):
        demo_dist[i] /= float(demo_count)

    # print()
    # print(demo_dist)
    # print(approach_dist)
    print(approach_dist[len(approach_dist) - 1],
          distance.jensenshannon(demo_dist, approach_dist),
          avg_length - avg_length_demo)
def compute_jensen_shannon_divergence(vec1, vec2):
    """Squared Jensen-Shannon distance (i.e. the JS divergence) between the
    empirical histograms of two value vectors.

    Values are rounded to 2 decimals and binned with width 0.1 over the
    combined range of both vectors.
    """
    print(vec1)
    print(vec2)
    vec1 = np.round(vec1, 2)
    vec2 = np.round(vec2, 2)
    mini = min(min(vec1), min(vec2))
    maxi = max(max(vec1), max(vec2))
    # Bin edges mini, mini+0.1, ... up to just past maxi.  Computing each
    # edge as mini + 0.1*k fixes the cumulative floating-point drift of the
    # original `i += 0.1` loop (0.1 is not exactly representable in binary,
    # so repeated addition drifted the edges and could change the bin count).
    n_edges = int((maxi + 0.1 - mini) / 0.1 + 1e-9) + 1
    b = [mini + 0.1 * k for k in range(n_edges)]
    print('Bins: ', b)
    p = np.histogram(vec1, bins=b)[0] / len(vec1)
    print(np.histogram(vec1, bins=b)[0])
    q = np.histogram(vec2, bins=b)[0] / len(vec2)
    print(np.histogram(vec2, bins=b)[0])
    print(p)
    print(q)
    # jensenshannon returns the distance; square it for the divergence.
    score = distance.jensenshannon(p, q)**2
    return score
Beispiel #12
0
    def _penalty_components(self, actual, predicted):
        """ Score one row of counts for a particular (neighborhood, year, month) """
        if (actual == predicted).all():
            return 0.0, 0.0, 0.0

        # get the overall penalty for bias
        bias_mask = np.abs(actual.sum() -
                           predicted.sum()) > self.allowable_raw_bias
        bias_penalty = self.bias_penalty if bias_mask.any() else 0

        # zero out entries below the threshold
        gt = self._zero_below_threshold(actual).ravel()
        dp = self._zero_below_threshold(predicted).ravel()

        # get the base Jensen Shannon distance; add a tiny bit of weight to each bin in order
        # to avoid all-zero arrays (and thus NaNs) without unduly influencing the distribution
        # induced by normalizing the arrays (dividing by sums); use base 2 to get a proper
        # spread of scores between [0, 1] instead of default base e which is more like [0, 0.83]
        #
        # ref: https://docs.scipy.org/doc/scipy-1.5.2/reference/generated/scipy.spatial.distance.jensenshannon.html
        jsd = jensenshannon(gt + 1e-9, dp + 1e-9, base=2)

        # get the overall penalty for the presence of misleading counts
        misleading_presence_mask = (gt == 0) & (dp > 0)
        misleading_presence_penalty = (misleading_presence_mask.sum() *
                                       self.misleading_presence_penalty)

        return jsd, misleading_presence_penalty, bias_penalty
def js_bootstrap(key, set_1, set_2, nsamples, ntests):
    '''
    Squared Jensen-Shannon distance between bounded-KDE estimates of one
    posterior parameter from two sample sets, bootstrapped over ntests
    random downsamplings.

    key: string posterior parameter
    set_1: first full posterior samples set
    set_2: second full posterior samples set
    nsamples: number for downsampling full sample set
    ntests: number of iterations over different nsamples realisations
    returns: 1 dim array (ntests)
    '''
    js_array = np.zeros(ntests)
    for j in tqdm(range(ntests)):
        # Cap nsamples so we never request more samples than either set has.
        nsamples = min([nsamples, len(set_1[key]), len(set_2[key])])
        lp = np.random.choice(set_1[key], size=nsamples, replace=False)
        bp = np.random.choice(set_2[key], size=nsamples, replace=False)
        # Common 100-point evaluation grid spanning both downsampled sets.
        x = np.atleast_2d(np.linspace(np.min([np.min(bp), np.min(lp)]),np.max([np.max(bp), np.max(lp)]),100)).T
        xlow = np.min(x)
        xhigh = np.max(x)
        # Override the grid bounds with known parameter bounds, if any.
        if key in default_bounds.keys():
            bounds = default_bounds[key]
            if "low" in bounds.keys():
                xlow = bounds["low"]
            if "high" in bounds.keys():
                # A "high" bound expressed in terms of mass_1 cannot be
                # resolved here; fall back to the empirical maximum.
                if isinstance(bounds["high"], str) and "mass_1" in bounds["high"]:
                    xhigh = np.max(x)
                else:
                    xhigh = bounds["high"]
        # NOTE(review): set_1_pdf is built from bp (drawn from set_2) and
        # set_2_pdf from lp (drawn from set_1) -- the names look swapped,
        # but the JS distance is symmetric so the result is unaffected.
        set_1_pdf = Bounded_1d_kde(bp, xlow=xlow, xhigh=xhigh)(x)
        set_2_pdf = Bounded_1d_kde(lp, xlow=xlow, xhigh=xhigh)(x)
        # Square converts the JS distance into a divergence; NaNs (e.g.
        # from degenerate KDEs) are mapped to 0.
        js_array[j] = np.nan_to_num(np.power(jensenshannon(set_1_pdf, set_2_pdf), 2))
    return js_array
 def predict(self, data, embedding_epochs_labeled=None):
     """Cluster `data` with the fitted GMM and derive the two task answers.

     Returns a tuple:
       task 1 -- 1 if any sense has a zero frequency in some epoch, else 0;
       task 2 -- Jensen-Shannon distance (base 2) between the sense
                 frequency vectors of the two epochs.
     """
     labels = self.gmm.predict(data)
     # Distinct epoch identifiers present in the labeling.
     epoch_labels = set(embedding_epochs_labeled)
     sense_frequencies = self.compute_cluster_sense_frequency(labels, embedding_epochs_labeled, epoch_labels)
     # NOTE(review): assumes sense_frequencies is indexable by 0 and 1
     # (i.e. exactly two epochs) -- confirm against
     # compute_cluster_sense_frequency.
     task_1_answer = int(any([True for sd in sense_frequencies if 0 in sense_frequencies[sd]]))
     task_2_answer = distance.jensenshannon(sense_frequencies[0], sense_frequencies[1], 2.0)
     return task_1_answer, task_2_answer
def get_k_nearest_docs(doc_dist, k=5):
    """Indices of the k documents whose topic distributions are closest
    (by Jensen-Shannon distance) to `doc_dist`, excluding exact matches.

    NOTE(review): relies on a module-level `topic_dist` DataFrame holding
    one topic distribution per document row.
    """
    jsd = topic_dist.apply(lambda row: jensenshannon(row, doc_dist), axis=1)
    # Distance 0 is the query document itself; skip those rows.
    return jsd[jsd != 0].nsmallest(n=k).index
Beispiel #16
0
def calc_kl(df, pop, stat, col2):
    """Compare observed vs simulated probability distributions per statistic.

    https://machinelearningmastery.com/divergence-between-probability-distributions/

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with a "pops" column, a "df_id" column holding
        "obs"/"sim", the `stat` column, and the `col2` value column.
    pop : str
        Population label used to subset df["pops"].
    stat : str
        Column whose unique values identify the statistics to compare.
    col2 : str
        Column holding the probability values being compared.

    Returns
    -------
    None.
        Prints relative entropy, KL divergence and JS distance (base 2,
        i.e. bits) for each unique value of `stat`.

    """
    dfpop = df[df["pops"] == pop]
    stats_list = dfpop[stat].unique()
    for i in stats_list:
        # Observed vs simulated rows for this statistic.
        obs = dfpop[(dfpop[stat] == i) & (dfpop["df_id"] == "obs")]
        sim = dfpop[(dfpop[stat] == i) & (dfpop["df_id"] == "sim")]
        # NOTE(review): assumes obs and sim rows align one-to-one in row
        # order -- confirm upstream ordering.
        ent = sum(rel_entr(obs[col2].values, sim[col2].values))
        kl = sum(kl_div(obs[col2].values, sim[col2].values))
        js = jensenshannon(obs[col2].values, sim[col2].values, base=2)
        print(f"{stat} {i}: rel_entr {ent}, KL_div {kl}, js_dist {js} bits")
Beispiel #17
0
 def _recurse_spn_local_rules(node, ):
     """Yield rules from an SPN subtree, children first (bottom-up), then
     this node's own local rules for each target variable whose local
     distribution deviates enough from its prior.

     NOTE(review): relies on closure variables from the enclosing scope
     (target_vars, value_dict, spn, self, p_from_scope,
     _leaves_target_allrules) -- confirm their contracts there.
     """
     # First bottom up: recurse into Sum/Product children.
     for c in node.children:
         if isinstance(c, Sum) or isinstance(c, Product):
             # yield from _recurse_spn_local_rules(c)
             for e in _recurse_spn_local_rules(c):
                 yield e
     # Then the local rule for this node (the root, id 0, is skipped).
     if node.id != 0:
         local_targets = set(target_vars).intersection(set(node.scope))
         for target in local_targets:
             p = p_from_scope(node, target, value_dict)
             # if categorical TODO non-categorical data?
             # Only targets with a sufficiently peaked local distribution.
             if max(p) >= self.min_local_p:
                 # prior = prior_dist[leaf.scope[0]]
                 # prior = [1 - prior, prior]7
                 prior = self.prior_gen.calculate_prior(
                     spn, target, value_dict)
                 # Keep the rule only if the local distribution differs
                 # enough from the prior (JS distance threshold).
                 js = jensenshannon(
                     p,
                     prior,
                 )
                 if js >= self.min_target_js:
                     other_vars = node.scope.copy()
                     other_vars.pop(other_vars.index(target))
                     # yield from _leaves_target_allrules(node, target, other_vars, value_dict, root=spn, targetp=p)
                     for res in _leaves_target_allrules(node,
                                                        target,
                                                        other_vars,
                                                        value_dict,
                                                        root=spn,
                                                        targetp=p):
                         yield res
Beispiel #18
0
def calculate_label_distribution_similarity(x: List[Example],
                                            y: List[Example]) -> float:
    """Calculate the similarity of the label distribution for 2 datasets.
    
    e.g. This can help you understand how well your train set models your dev and test sets.
    Empircally you want a similarity over **0.8** when comparing your train set to each of your
    dev and test sets.

        calculate_label_distribution_similarity(corpus.train, corpus.dev)
        # 98.57

        calculate_label_distribution_similarity(corpus.train, corpus.test)
        # 73.29 - This is bad, let's investigate our test set more
    
    Args:
        x (List[Example]): Dataset
        y (List[Example]): Dataset to compare x to
    
    Returns:
        float: Similarity of label distributions
    """
    def pipeline(data: List[Example]) -> Sequence[float]:
        # NER stats -> sorted per-type counts -> probability vector.
        stats = cast(NERStats, get_ner_stats(data))
        return get_probs_from_counts(get_sorted_type_counts(stats))

    js_distance = jensenshannon(pipeline(x), pipeline(y))

    # Convert the distance (0 == identical) into a percentage similarity.
    return (1 - js_distance) * 100
Beispiel #19
0
def best_match_v3(input_title, precalc_vectors: np.ndarray, title_names_pool):
    """Rank candidate titles against `input_title` under three distances
    (cosine, Wasserstein, Jensen-Shannon) over BERT embeddings and return
    the top-3 candidates per metric as a list of zipped tuples."""
    cleaned = pre_process(input_title)
    bb = BertEncoder()
    vec = bb.bert_encoder(cleaned)

    trim_n = 300  # compare only the first 300 embedding components
    query = vec[:trim_n]
    cos_scores = []
    wass_scores = []
    jsh_scores = []
    for idx in range(precalc_vectors.shape[0]):
        candidate = precalc_vectors[idx][:trim_n]
        cos_scores.append(cosine(query, candidate))
        wass_scores.append(wasserstein_distance(query, candidate))
        # jensenshannon needs non-negative inputs, hence the abs().
        jsh_scores.append(jensenshannon(np.abs(query), np.abs(candidate)))

    # Top-3 per metric.  NOTE: cosine takes the *largest* values (as in the
    # original); the other two take the smallest.
    top_n = 3
    best_cos = np.argpartition(cos_scores, -top_n)[-top_n:]
    best_wass = np.argpartition(wass_scores, top_n)[:top_n]
    best_jsh = np.argpartition(jsh_scores, top_n)[:top_n]

    names = np.array(title_names_pool)
    cos_arr = np.array(cos_scores)
    wass_arr = np.array(wass_scores)
    jsh_arr = np.array(jsh_scores)
    res = list(
        zip(names[best_cos], cos_arr[best_cos], names[best_wass],
            wass_arr[best_wass], names[best_jsh], jsh_arr[best_jsh]))
    print(res)
    return res
Beispiel #20
0
def JSD(P, Q, idx, n_dims, return_dict, reshape=False):
    """Fill return_dict[idx] with per-row Jensen-Shannon distances between
    P and Q for rows idx[0]..idx[1]-1 (multiprocessing-style output dict).

    With reshape=True the flat rows are reinterpreted as
    (row, point, n_dims) and the distance is averaged over the n_dims
    feature dimensions.
    """
    if reshape:
        P = np.reshape(P, (-1, int(P.shape[1] / n_dims), n_dims))
        Q = np.reshape(Q, (-1, int(Q.shape[1] / n_dims), n_dims))
        return_dict[idx] = [
            np.mean([
                distance.jensenshannon(P[row, :, d], Q[row, :, d])
                for d in np.arange(n_dims)
            ])
            for row in np.arange(idx[0], idx[1])
        ]
    else:
        return_dict[idx] = [
            distance.jensenshannon(P[row, :], Q[row, :])
            for row in np.arange(idx[0], idx[1])
        ]
Beispiel #21
0
    def score(self, original_data, synthetic_data, score_dict=False):
        """Calculate jensen_shannon distance between original and synthetic data.
        Look for more elaborate evaluation techniques in synthesis.evaluation.

        Parameters
        ----------
        original_data: pandas.DataFrame
            Original data that was seen in fit
        synthetic_data: pandas.DataFrame
            Synthetic data that was generated based original_data
        score_dict: bool
            If true, also return per-column jensen_shannon scores
        Returns
        -------
        average jensen_shannon distance: float
            Average jensen_shannon distance over all columns
        per-column jensen_shannon distance (if score_dict): dict
            Per-column jensen_shannon distance
        """
        per_column = {}
        for col in original_data.columns:
            # Align value counts on the union of observed values: values
            # sampled in only one of the frames get a zero count on the
            # other side.
            orig_counts, synth_counts = \
                original_data[col].value_counts(dropna=False).align(
                    synthetic_data[col].value_counts(dropna=False),
                    join='outer', axis=0, fill_value=0
                )
            # jensenshannon normalizes the count vectors internally.
            per_column[col] = jensenshannon(orig_counts, synth_counts)

        average = sum(per_column.values()) / len(original_data.columns)
        return (average, per_column) if score_dict else average
def show_following_distribution(filename, ids_to_plot):
    """Plot following-count histograms for each artist's followers and
    annotate each with a divergence against a reference distribution.

    filename: sqlite database file with per-artist follower_data_<id> tables.
    ids_to_plot: dict mapping artist DB ids to display names.
    Returns (bins, distributions) from the plotted histograms.
    """
    user_file = filename
    artist_db = Database_Instance(user_file)

    distributions = []

    for curr in ids_to_plot.keys():
        try:
            # Follower following-counts, capped at 500 to cut the long tail.
            res = [a[1] for a in artist_db.query('''
                SELECT id, followings_count FROM follower_data_{} where followings_count < 500
            '''.format(curr))]

            n, bins, patches = plt.hist(res, bins=50)

            # NOTE(review): `normal` is not defined in this function --
            # presumably a module-level reference histogram tuple; confirm.
            if len(normal) != 0:
                plt.title('Following Count Distribution for the Followers of Artist {}'.format(str(ids_to_plot[curr])))
                plt.ylabel('Frequency')
                plt.xlabel('Following Count')
                plt.savefig('./images/BadArtist{}.png'.format(str(ids_to_plot[curr])))

                # Calculate Jensen-Hannon Divergence
                # NOTE(review): scipy's jensenshannon already returns the
                # square root of the JS divergence, so sqrt() here yields
                # divergence**0.25, not the divergence -- confirm intent.
                jh = math.sqrt(jensenshannon(normal[0], n))
                plt.annotate("JH: " + str(jh)[:8], xy=(0.7, 0.9), xycoords='axes fraction', fontsize=14, color='purple')
                plt.savefig('./images/JH_GoodArtist{}.png'.format(str(ids_to_plot[curr])))
                #plt.show()

            plt.clf()
            distributions.append(n)
        except:
            # NOTE(review): bare except hides all errors, and `bins` may be
            # unbound at the return below if the first iteration fails.
            print("ID " + str(ids_to_plot) + " given possibly not in DB")
    return bins, distributions
def jensen_shannon_divergence(p, q, base=2):
    """Return the Jensen-Shannon *distance* between two 1-D arrays, rounded
    to 4 decimal places.

    NOTE: despite the function's name, scipy's ``jensenshannon`` returns
    the JS distance (the square root of the JS divergence); the original
    docstring incorrectly claimed a divergence.  The behavior is kept for
    backward compatibility.

    Parameters
    ---------
    p : array
        left probability array
    q : array
        right probability array
    base : numeric, default 2
        logarithm base

    Returns
    -------
    jsd : float
        Jensen-Shannon distance of p and q, rounded to 4 decimals
    """
    from scipy.spatial.distance import jensenshannon

    # Normalize each array to sum to 1.  jensenshannon normalizes again
    # internally, but this also coerces list inputs to float arrays.
    p = np.asarray(p)
    q = np.asarray(q)
    p = p / np.sum(p, axis=0)
    q = q / np.sum(q, axis=0)
    return round(jensenshannon(p, q, base=base), 4)
Beispiel #24
0
def jsd(a, b):
    """For every row of every batch, print the paired rows (minus their
    last element) and the squared Jensen-Shannon distance between them.
    Returns None (printing is the only effect)."""
    for batch_idx in range(len(a)):
        rows_a = a[batch_idx]
        rows_b = b[batch_idx]
        for row_idx in range(len(rows_a)):
            # The last element of each row is excluded from the comparison.
            lhs = rows_a[row_idx][:-1]
            rhs = rows_b[row_idx][:-1]
            print(lhs, rhs)
            dist_val = distance.jensenshannon(lhs, rhs)
            # Square the distance to report the JS divergence.
            print(dist_val**2)
Beispiel #25
0
def H_metric(H1, H2, mode='chi2'):
    '''Distance between two 1-D histograms with matched bins.

    H1, H2 -- the histograms (numpy arrays).
    mode -- which metric to use:
        l2        -- Euclidean distance
        l1        -- Manhattan distance
        vcos      -- Vector cosine distance
        intersect -- Histogram intersection distance
        chi2      -- Chi square distance (default)
        jsd       -- Jensen-Shannon divergence
        emd       -- Earth Mover's Distance

    Raises NotImplementedError for any other mode.
    '''

    if mode == 'l2':
        return np.linalg.norm(H1 - H2, ord=2)
    if mode == 'l1':
        return np.linalg.norm(H1 - H2, ord=1)
    if mode == 'vcos':
        return vcos_dist(H1, H2)
    if mode == 'intersect':
        # Sum of bin-wise minima.
        return np.sum(np.min(np.stack((H1, H2)), axis=0))
    if mode == 'chi2':
        # Bins where both histograms are zero contribute 0 (divide guarded
        # by `where=`).
        diff_sq = 2 * ((H1 - H2)**2).astype(float)
        total = H1 + H2
        return np.sum(
            np.divide(diff_sq, total, out=np.zeros_like(diff_sq),
                      where=total != 0))
    if mode == 'jsd':
        return jensenshannon(H1, H2)
    if mode == 'emd':
        return wasserstein_distance(H1, H2)

    raise NotImplementedError()
Beispiel #26
0
def KDE_estimates(positive, negative, njobs):
    # Implements the KDE estimation procedure separately for the driver
    # (positive) and passenger (negative) data and computes the JS distance.
    """
    Arguments:
        positive=The feature matrix for driver mutations
        negative=The feature matrix for passenger mutations
        njobs=Number of jobs to run in parallel for the Grid Search
    Returns:
        js_t= Jensen-Shannon distance between the estimated densities
    """
    np.random.seed(333)
    bandwidths = np.logspace(-1, 1, 30)

    def fit_best_kde(data):
        # Choose the bandwidth by 3-fold cross-validated grid search, then
        # refit a fresh KDE with the winning bandwidth.
        search = GridSearchCV(KernelDensity(kernel='gaussian'),
                              {'bandwidth': bandwidths},
                              cv=3, n_jobs=njobs)
        search.fit(data)
        return KernelDensity(
            kernel='gaussian',
            bandwidth=search.best_params_['bandwidth']).fit(data)

    kde_pos = fit_best_kde(positive)
    kde_neg = fit_best_kde(negative)

    # score_samples returns log-densities; exponentiate before comparing.
    log_dens_pos = kde_pos.score_samples(positive)
    log_dens_neg = kde_neg.score_samples(negative)
    return distance.jensenshannon(np.exp(log_dens_pos), np.exp(log_dens_neg))
Beispiel #27
0
def obj_jensen_shannon(index, fitness, observation, new_pop, envs, args):
    bins = [i for i in range(-5, 5)]

    fit_distribution = list()
    for i in range(len(envs)):
        fit_distribution.append(fitness[i][index])
    fit_distribution = np.array(fit_distribution)

    if len(fit_distribution) <= 1:
        return 0

    esp = fit_distribution.mean()
    std = fit_distribution.std()
    fit_distribution = (fit_distribution - esp) / (std + 1e-8)

    fit_distribution_de = np.histogram(fit_distribution,
                                       bins=bins,
                                       density=True)[0]

    normal_bins = [
        norm.cdf(bins[i + 1]) - norm.cdf(bins[i])
        for i in range(len(bins) - 1)
    ]

    return -jensenshannon(fit_distribution_de, normal_bins, base=2)
Beispiel #28
0
    def run_summary_operations(x: np.ndarray, y: np.ndarray) -> Dict:
        """Compute pairwise summary statistics between two series.

        Returns a dict with KL divergence, Jensen-Shannon distance (base 2;
        stored under the key js_div), co-integration, covariance and
        Pearson/Spearman/Kendall correlations.  Statistics whose guarded
        computations fail are reported as NaN.
        """
        kl_div = stats.entropy(x, y)
        # NOTE(review): jensenshannon returns the JS *distance*, although
        # the key name says div -- confirm downstream expectations.
        js_div = jensenshannon(x, y, base=2)

        # Co-integration requires complete data.
        try:
            co_int = co_integration(x, y)
        except MissingDataError:
            co_int = np.nan

        covar = covariance(x, y)

        # Pearson correlation can raise on degenerate (e.g. constant) input.
        try:
            corr_p, _ = correlation(x, y, 'pearson')
        except ValueError:
            corr_p = np.nan

        corr_s, _ = correlation(x, y, 'spearman')
        corr_k, _ = correlation(x, y, 'kendall')

        feature_set = \
            dict(kl_div=kl_div,
                 js_div=js_div,
                 co_int=co_int,
                 covar=covar,
                 corr_p=corr_p,
                 corr_s=corr_s,
                 corr_k=corr_k)

        return feature_set
Beispiel #29
0
  def compute_with_metadata(
      self,
      labels: Sequence[Any],
      preds: Sequence[Any],
      label_spec: types.LitType,
      pred_spec: types.LitType,
      indices: Sequence[types.ExampleId],
      metas: Sequence[JsonDict],
      config: Optional[JsonDict] = None) -> Dict[Text, float]:
    """Compute prediction-flip metrics over paired examples.

    Returns an ordered dict with the number of pairs found, the swap rate
    (fraction of pairs whose predicted class changed), and the mean
    Jensen-Shannon divergence between paired prediction distributions.
    Returns {} when no pairs are found.
    """
    del labels  # Unused; we only care about preds.
    del label_spec  # Unused; we only care about preds.

    ret = collections.OrderedDict()

    pairs = self.find_pairs(indices, metas)
    ret['num_pairs'] = len(pairs)
    if ret['num_pairs'] == 0:
      return {}

    pred_idxs = get_classifications(preds, pred_spec, config)

    # 'swapped' just means the prediction changed.  Note: is_swapped actually
    # holds "prediction unchanged" booleans, so the swap rate is one minus
    # its mean.
    is_swapped = [(pred_idxs[i] == pred_idxs[j]) for i, j in pairs]
    ret['swap_rate'] = 1 - np.mean(is_swapped)

    # Jensen-Shannon divergence, as a soft measure of prediction change.
    # (jensenshannon returns the distance, hence the squaring.)
    jsds = [
        scipy_distance.jensenshannon(preds[i], preds[j])**2 for i, j in pairs
    ]
    ret['mean_jsd'] = np.mean(jsds)

    return ret
Beispiel #30
0
 def jsdivergence(self, *arg, background=BLOSUM62_BG):
     """Jensen-Shannon divergence of each column against a background.

     With no positional argument, each column's residue distribution is
     compared against the BLOSUM62 background.  With one alignment
     argument (which must have the same number of columns), the columns of
     the two alignments are compared pairwise; bidirectional equality
     holds since JS is symmetric.  Returns a 1-D numpy array of
     per-column values.
     """
     assert all(self.is_position())
     if len(arg) > 1:
         raise Exception(
             'Provide only 1 argument to be compared against or none to compare against the BLOSUM62 background'
         )
     elif len(arg) == 1:
         # BUGFIX: the original `assert arg[0].shape[1], self.shape[1]`
         # asserted only that the column count was truthy and used
         # self.shape[1] as the assert *message*; the intended check is
         # column-count equality.
         assert arg[0].shape[1] == self.shape[1], \
             'number of columns must be the same in both alignments'
         b = self.probability(alphabet=background[0])[0].T
         q = arg[0].probability(alphabet=background[0])[0].T
         return np.array([jensenshannon(*i) for i in zip(b, q)])
     elif len(arg) == 0:
         p = self.probability(alphabet=background[0])[0].T
         return np.array([jensenshannon(i, background[1]) for i in p])