def evaluate_distribution(event_name, words, session, ids):
    """
    Run a complete evaluation of the distribution of words for an event: compute the
    Jensen-Shannon divergence and the Jaccard index against each reference timeline.
    :param event_name: name of the event to be evaluated
    :param words: number of most common words to compare
    :param session: passed through to calculate_distribution_event
    :param ids: passed through to calculate_distribution_event
    :return:
    """
    print(event_name, words)
    words_event, distribution_event, pairs_event = calculate_distribution_event(event_name, session, ids, True)
    path_references = Path(LOCAL_DATA_DIR_2, 'data', event_name, 'summaries', 'reference')
    references_list = [reference for reference in path_references.iterdir() if reference.is_file()]
    event_dist = dit.ScalarDistribution(words_event, distribution_event)
    words_set_event = set(words_event[:words])
    print('Most Common words in event: {}'.format(words_set_event))
    total_dist, all_words = global_distribution(references_list)
    all_words_set = set(all_words[:words])
    jaccard = len(words_set_event.intersection(all_words_set)) / len(words_set_event.union(all_words_set))
    print('Most Common words in all timelines: {}'.format(all_words_set))
    print('Jaccard Index with all timelines: {}'.format(jaccard))
    print('Jensen-Shannon with all timelines: {}'.format(jensen_shannon_divergence([total_dist, event_dist])))
    for reference in references_list:
        words_timeline, probs_timeline, pairs_timeline = calculate_distribution_timeline(event_name, reference)
        dist_timeline = dit.ScalarDistribution(words_timeline, probs_timeline)
        print('----------------------------')
        word_set_timeline = set(words_timeline[:words])
        print(reference.name)
        print('Most Common words in timeline: {}'.format(word_set_timeline))
        print('Jensen-Shannon: {}'.format(jensen_shannon_divergence([dist_timeline, event_dist])))
        jaccard = len(words_set_event.intersection(word_set_timeline)) / len(words_set_event.union(word_set_timeline))

        print('Jaccard Index: {}'.format(jaccard))
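A minimal self-contained sketch of the two metrics this example reports, on invented toy vocabularies (dit merges the differing sample spaces when computing the divergence, as the function above relies on):

import dit
from dit.divergences import jensen_shannon_divergence

# Toy word distributions; vocabularies and probabilities are invented.
words_a = ['fire', 'storm', 'flood']
words_b = ['fire', 'quake', 'flood']
dist_a = dit.ScalarDistribution(words_a, [0.5, 0.3, 0.2])
dist_b = dit.ScalarDistribution(words_b, [0.4, 0.4, 0.2])

# Jaccard index of the word sets, as in evaluate_distribution above.
set_a, set_b = set(words_a), set(words_b)
print('Jaccard:', len(set_a & set_b) / len(set_a | set_b))  # 2 shared / 4 total = 0.5
print('JSD (bits):', jensen_shannon_divergence([dist_a, dist_b]))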
Example #2
def JSD_pos_dataframe(matrix):
    """
    Computes the Jensen-Shannon divergence between the POS distributions.

    @args :
    - matrix: dataframe of POS distributions, one row per language

    @output :
    - a dataframe with the distance for every permutation (including fr/fr)
    """
    distances = dict()

    langues = matrix.index.tolist()
    pos = matrix.columns.tolist()

    for l1, l2 in itertools.product(sorted(langues), repeat=2):  # AA AB BA BB
        distrib_1 = matrix.loc[l1].tolist()
        distrib_2 = matrix.loc[l2].tolist()
        X = dit.ScalarDistribution(pos, distrib_1)
        Y = dit.ScalarDistribution(pos, distrib_2)
        JS = dit.divergences.jensen_shannon_divergence([X, Y])
        distances.setdefault(l1, dict())[l2] = JS

    df = pandas.DataFrame(distances)
    return df
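Assuming the function above is in scope (with itertools, pandas and dit imported), a toy usage sketch with an invented two-language POS matrix:

import pandas

# Rows are languages, columns are POS tags; each row sums to one.
matrix = pandas.DataFrame([[0.5, 0.3, 0.2],
                           [0.4, 0.4, 0.2]],
                          index=['en', 'fr'],
                          columns=['NOUN', 'VERB', 'ADJ'])
df = JSD_pos_dataframe(matrix)
print(df)  # symmetric, with zeros on the en/en and fr/fr diagonal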
Example #3
def fitness(individual, data):
    individual = vector_to_dna(individual)
    # fitness = np.linalg.norm(target - vector(individual))
    fitness = jensen_shannon_divergence([
        dit.ScalarDistribution(target),
        dit.ScalarDistribution(vector(individual))
    ])
    return fitness
Example #4
    def mutual_info(self, sentence_a, sentence_b):
        """Compute a manifold of information metrics:
        mutual information, joint information, conditioned information loss,
        conditioned information noise, and self-information.
        """
        vocab = self.vocab.copy()
        token_counts_1 = self.__get_cnts(sentence_a, vocab)
        token_counts_2 = self.__get_cnts(sentence_b, vocab)
        self.logging.info('token count processed')

        self.logging.info('vocab #' + str(len(self.vocab.keys())))

        alphabet_source = list(set(token_counts_1.keys()))
        self.logging.info('alphabet_source #' + str(len(alphabet_source)))

        alphabet_target = list(set(token_counts_2.keys()))
        self.logging.info('alphabet_target #' + str(len(alphabet_target)))

        self.logging.info('diff src2tgt #' + str(set(token_counts_1.keys()) - set(token_counts_2.keys())))
        self.logging.info('diff tgt2src #' + str(set(token_counts_2.keys()) - set(token_counts_1.keys())))

        assert len(alphabet_source) == len(alphabet_target)

        # Self-information (entropy) of each side.
        scalar_distribution_source = dit.ScalarDistribution(alphabet_source, self.__get_freqs(token_counts_1))
        entropy_source = dit.shannon.entropy(scalar_distribution_source)

        scalar_distribution_target = dit.ScalarDistribution(alphabet_target, self.__get_freqs(token_counts_2))
        entropy_target = dit.shannon.entropy(scalar_distribution_target)

        # Joint information: entropy of the pooled token counts.
        token_counts = {token: (token_counts_1[token] + token_counts_2[token]) for token in vocab}
        alphabet = list(set(token_counts.keys()))
        self.logging.info('alphabet #' + str(len(alphabet)))
        frequencies = self.__get_freqs(token_counts)
        # WARNING: if a document is empty, frequencies might create an issue!
        scalar_distribution = dit.ScalarDistribution(alphabet, frequencies)
        joint_entropy = dit.shannon.entropy(scalar_distribution)

        # Mutual information, noise and loss via the entropy identities.
        mutual_information = entropy_source + entropy_target - joint_entropy
        noise = joint_entropy - entropy_target
        loss = joint_entropy - entropy_source

        return [entropy_source, entropy_target, joint_entropy,
                mutual_information, loss, noise]
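The method relies on the identity I(X;Y) = H(X) + H(Y) - H(X,Y), applied here to pooled token counts as a stand-in for a true joint. A quick sanity check of the identity on a real joint distribution with dit:

import dit

# Two correlated binary variables as a joint distribution.
d = dit.Distribution(['00', '01', '10', '11'], [0.4, 0.1, 0.1, 0.4])
h_x = dit.shannon.entropy(d, [0])   # 1.0 bit
h_y = dit.shannon.entropy(d, [1])   # 1.0 bit
h_xy = dit.shannon.entropy(d)       # ~1.722 bits
print(h_x + h_y - h_xy)                             # ~0.278 bits
print(dit.shannon.mutual_information(d, [0], [1]))  # same value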
Example #5
def create_distribution(event, reference_name, summary_name):
    path_reference = Path(LOCAL_DATA_DIR_2, 'data', event, 'summaries', 'reference', reference_name)
    path_summary = Path(LOCAL_DATA_DIR_2, 'data', event, 'summaries', 'system', summary_name)
    with path_reference.open('r') as reference, path_summary.open('r') as summary:
        reference_text = reference.read()
        summary_text = summary.read()
        summary_words, summary_probs, _ = calculate_vocab_distribution(summary_text)
        reference_words, reference_probs, _ = calculate_vocab_distribution(reference_text)
        summary_dist = dit.ScalarDistribution(summary_words, summary_probs)
        reference_dist = dit.ScalarDistribution(reference_words, reference_probs)
    return reference_dist, summary_dist
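Several snippets here call a calculate_vocab_distribution helper that is not shown. A hypothetical minimal stand-in, consistent with how it is used (words sorted by descending probability, matching probabilities, and (word, probability) pairs):

from collections import Counter

def calculate_vocab_distribution(text):
    # Hypothetical stand-in: relative word frequencies of a text.
    counts = Counter(text.split())
    total = sum(counts.values())
    words = [w for w, _ in counts.most_common()]
    probs = [counts[w] / total for w in words]
    return words, probs, list(zip(words, probs))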
Example #6
def fitness(individual, data):
    individual = vector_to_dna(individual)
    if mode == "JSD":
        return jensen_shannon_divergence([
            dit.ScalarDistribution(target / len(k)),
            dit.ScalarDistribution(vector(individual) / len(k))
        ])
    elif mode == "ED":
        return np.linalg.norm(target - vector(individual))
    else:
        raise Exception("Fitness mode must be JSD or ED")
Example #7
def calculate_jsd(input_values, input_probabilities):
    """
    Calculates the Jensen-Shannon divergence of each table row against the input values.
    """
    processed_probabilities = scale_table(input_probabilities)
    x = dit.ScalarDistribution(amino_list, input_values, sample_space=amino_list, sort=True)
    jsd_values = []
    for row in processed_probabilities:
        y = dit.ScalarDistribution(amino_list, row, sample_space=amino_list, sort=True)
        jsd_values.append(jensen_shannon_divergence([x, y]))
    return jsd_values
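amino_list and scale_table are defined elsewhere in that project; a toy invocation with invented stand-ins (a three-letter alphabet and already-scaled rows):

import dit
from dit.divergences import jensen_shannon_divergence

amino_list = ['A', 'C', 'G']              # invented toy alphabet
input_values = [0.5, 0.25, 0.25]
processed = [[0.4, 0.3, 0.3], [0.9, 0.05, 0.05]]

x = dit.ScalarDistribution(amino_list, input_values, sample_space=amino_list, sort=True)
for row in processed:
    y = dit.ScalarDistribution(amino_list, row, sample_space=amino_list, sort=True)
    print(jensen_shannon_divergence([x, y]))  # the second row is farther from x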
Example #8
    def msi(self, sentence_a, sentence_b):
        '''@danaderp
        Minimum Shared Information'''
        vocab = self.vocab.copy()
        token_counts_1 = self.__get_cnts(sentence_a, vocab)
        token_counts_2 = self.__get_cnts(sentence_b, vocab)
        self.logging.info('token count processed')
        # Minimum shared tokens
        token_counts = {token: min(token_counts_1[token], token_counts_2[token]) for token in vocab}

        alphabet = list(set(token_counts.keys()))
        frequencies = self.__get_freqs(token_counts)
        self.logging.info('frequencies processed')

        if not frequencies:
            # Empty list: nan means that src and target share no information at all.
            entropies = float('nan')
            extropies = float('nan')
            self.logging.info('FREQUENCIES NOT COMPUTED!!!<--------------')
        else:
            scalar_distribution = dit.ScalarDistribution(alphabet, frequencies)
            self.logging.info('scalar_distribution processed')

            entropies = dit.shannon.entropy(scalar_distribution)
            self.logging.info('entropies processed')

            extropies = dit.other.extropy(scalar_distribution)
            self.logging.info('extropies processed')
        return [entropies, extropies]
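The min-count construction in msi is the multiset intersection of the two texts, which collections.Counter expresses directly:

from collections import Counter

cnt_a = Counter('the cat sat on the mat'.split())
cnt_b = Counter('the cat ate the fish'.split())
print(cnt_a & cnt_b)  # Counter({'the': 2, 'cat': 1}): the shared tokens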
Example #9
def DtoSD(dist, extract):
    """
    Convert a Distribution to a ScalarDistribution.

    Parameters
    ----------
    dist : Distribution
        The Distribution to convert to a ScalarDistribution.
    extract : bool
        If `True` and the outcome length is 1, then we extract the sole
        element from each outcome and use that value as the scalar outcome.

    """
    if extract and dist.outcome_length() == 1:
        outcomes = tuple(outcome[0] for outcome in dist.outcomes)
        sample_space = dist.alphabet[0]
    else:
        outcomes = dist.outcomes
        sample_space = None

    # If people really want it, we can use _make_distribution.
    # But we have to decide if we want to set the alphabet to the
    # entire sample or just the sample space represented in outcomes.
    d = dit.ScalarDistribution(outcomes,
                               dist.pmf,
                               sample_space=sample_space,
                               base=dist.get_base(),
                               prng=dist.prng,
                               sort=False,
                               sparse=dist.is_sparse(),
                               validate=False)

    return d
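A small usage sketch of the conversion above: with extract=True, length-1 outcomes collapse to scalar outcomes (the Distribution here is invented for illustration):

import dit

d = dit.Distribution(['a', 'b'], [0.25, 0.75])  # outcomes of length 1
sd = DtoSD(d, extract=True)
print(sd.outcomes, sd.pmf)                      # ('a', 'b') [0.25 0.75]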
Example #10
def calculate_Y(X, YgX, model="ising"):
    X.make_dense()
    Y_pmf = (X.pmf[:, np.newaxis] * YgX).sum(axis=0)
    outcomes = [-1, 1] if model == "ising" else [0, 1]
    Yd = {o: p for o, p in zip(outcomes, Y_pmf)}
    Y = dit.ScalarDistribution(Yd)
    return Y
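A toy invocation with an invented conditional table: row i of YgX is p(Y | X = x_i), so the result is the marginal of Y:

import numpy as np
import dit

X = dit.ScalarDistribution([-1, 1], [0.6, 0.4])
YgX = np.array([[0.9, 0.1],
                [0.2, 0.8]])
Y = calculate_Y(X, YgX, model="ising")
print(Y.pmf)  # [0.62 0.38]: X's pmf pushed through the conditional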
Example #11
def test_product_nonjoint():
    """
    Test product_distribution() from a ScalarDistribution.
    """
    d = dit.ScalarDistribution([.5, .5])
    with pytest.raises(Exception):
        dit.product_distribution(d)
Example #12
def test_product_nonjoint():
    """
    Test product_distribution() from a ScalarDistribution.

    """
    d = dit.ScalarDistribution([.5, .5])
    assert_raises(Exception, dit.product_distribution, d)
Example #13
def get_dist(token_counts):
    '''Takes a counter of token occurrences and returns the corresponding
    dit ScalarDistribution over the corpus vocabulary.'''
    alphabet = list(set(token_counts.keys()))
    frequencies = get_freqs(token_counts)
    return dit.ScalarDistribution(alphabet, frequencies)
Example #14
def calculate_XY(X, YgX, model="ising"):
    X.make_dense()
    a = X.alphabet
    outcomes = list(itertools.product(a, repeat=X.outcome_length() + 1))
    # Flatten so each joint outcome is paired with a single probability.
    XY_pmf = (X.pmf[:, np.newaxis] * YgX).flatten()
    XYd = {o: p for o, p in zip(outcomes, XY_pmf)}
    XY = dit.ScalarDistribution(XYd)
    return XY
Example #15
def get_dist(token_counts):
    '''Takes a counter of token occurrences and returns the corresponding
    dit ScalarDistribution over the corpus vocabulary.'''
    num_tokens = sum(token_counts.values())
    # Iterate the counter once so outcomes and frequencies stay aligned.
    outcomes = list(token_counts.keys())
    frequencies = [token_counts[token] / num_tokens for token in outcomes]
    return dit.ScalarDistribution(outcomes, frequencies)
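Assuming the function above, a quick check on a toy counter:

from collections import Counter
import dit

d = get_dist(Counter('to be or not to be'.split()))
print(dit.shannon.entropy(d))  # ~1.918 bits for this toy corpus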
Example #16
def test_pruned_samplespace_scalar():
    """Prune a sample space from a ScalarDistribution."""
    pmf = [1 / 2, 0, 1 / 2]
    d = dit.ScalarDistribution(pmf)
    d2 = dit.algorithms.pruned_samplespace(d)
    ss2_ = [0, 2]
    ss2 = list(d2.sample_space())
    assert ss2 == ss2_
    assert np.allclose(d2.pmf, [1 / 2, 1 / 2])
Example #17
def test_expanded_samplespace2():
    """Expand a sample space from a ScalarDistribution."""
    pmf = [1 / 2, 1 / 2]
    ss = [0, 1]
    d = dit.ScalarDistribution(pmf)
    assert list(d.sample_space()) == ss
    ss2 = [0, 1, 2]
    d2 = dit.algorithms.expanded_samplespace(d, ss2)
    assert list(d2.sample_space()) == ss2
Example #18
File: i.py Project: LeyliG/ds4se
def dit_shannon(token_counts):
    num_tokens = sum(token_counts.values())
    # Iterate the counter once so outcomes and frequencies stay aligned.
    outcomes = list(token_counts.keys())
    frequencies = [token_counts[token] / num_tokens for token in outcomes]
    d = dit.ScalarDistribution(outcomes, frequencies)
    return dit.shannon.entropy(d)
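A quick check of the helper on a toy counter:

from collections import Counter

print(dit_shannon(Counter('a a b c'.split())))  # 1.5 bits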
Example #19
def test_pruned_samplespace():
    """Prune a sample space from a Distribution."""
    outcomes = ['0', '1', '2']
    pmf = [1 / 2, 0, 1 / 2]
    d = dit.ScalarDistribution(outcomes, pmf)
    d2 = dit.algorithms.pruned_samplespace(d)
    ss2_ = ['0', '2']
    ss2 = list(d2.sample_space())
    assert ss2 == ss2_
    assert np.allclose(d2.pmf, [1 / 2, 1 / 2])
Example #20
def test_pruned_samplespace2():
    """Prune a sample space while specifying a desired sample space."""
    outcomes = ['0', '1', '2', '3']
    pmf = [1 / 2, 0, 1 / 2, 0]
    ss2_ = ['0', '1', '2']
    d = dit.ScalarDistribution(outcomes, pmf)
    d2 = dit.algorithms.pruned_samplespace(d, sample_space=ss2_)
    # We must make it dense, since the zero element will not appear in pmf.
    d2.make_dense()
    ss2 = list(d2.sample_space())
    assert ss2 == ss2_
    assert np.allclose(d2.pmf, [1 / 2, 0, 1 / 2])
Example #21
def global_distribution(references_list):
    """
    Calculate the distribution of words over the concatenation of all timelines for the event.
    :param references_list: list of paths to the files containing the timelines.
    :return: total_distribution, a ScalarDistribution over the words;
             words, all words sorted by probability
    """
    total_reference = ''
    for reference in references_list:
        with reference.open() as f:
            total_reference = total_reference + f.read()
    words, probs, pairs = calculate_vocab_distribution(total_reference)
    total_distribution = dit.ScalarDistribution(words, probs)
    return total_distribution, words
Example #22
def calc_stats(sequence, verbose):
    """
    INPUT:
        * sequence - sequence sampled from the seq_gen class
        * verbose - if True, print a summary of the computed statistics
    OUTPUT:
        - summary statistics, including:
        * js_temp - Jensen-Shannon divergence between the empirical
                    standards-between-deviants distributions of the two regimes
    """

    sequence_sub = sequence[sequence[:, 2] != 0.5, :]
    deviants, regime_switches = find_deviants(sequence)

    # Catch trial/regime switch prob
    catch_prob = len(sequence[sequence[:, 2] == 0.5, 0]) / sequence.shape[0]
    switch_prob = regime_switches / sequence.shape[0]

    stim_prob_overall = len(sequence[sequence[:, 2] == 1, 2]) / (
        len(sequence[sequence[:, 2] == 1, 2]) +
        len(sequence[sequence[:, 2] == 0, 2]))

    # 0th Order Stimulus probability (empirical)
    stim_prob_reg0 = np.mean(sequence[sequence[:, 1] == 0, 2])
    stim_prob_reg1 = np.mean(sequence[sequence[:, 1] == 1, 2])

    # 1st Order Stimulus prob (empirical)
    alt_prob_reg0 = np.mean(deviants[deviants[:, 0] == 0, 1])
    alt_prob_reg1 = np.mean(deviants[deviants[:, 0] == 1, 1])

    # Empirical pmf of standards between deviants for both regimes
    reg_0_dev = deviants[deviants[:, 0] == 0, :]
    reg_1_dev = deviants[deviants[:, 0] == 1, :]

    # Average train-length per regime:
    avg_train_reg0 = (np.sum(deviants[deviants[:, 0] == 0, 3]) /
                      np.count_nonzero(deviants[deviants[:, 0] == 0, 3]))
    avg_train_reg1 = (np.sum(deviants[deviants[:, 0] == 1, 3]) /
                      np.count_nonzero(deviants[deviants[:, 0] == 1, 3]))

    # Time spent in Regimes
    trials_in_reg0 = deviants[deviants[:, 0] == 0, 0]
    time_reg0 = trials_in_reg0.shape[0] / deviants.shape[0]

    try:
        epmf_reg_0_dev = np.histogram(reg_0_dev[:, 3],
                                      bins=int(np.max(reg_0_dev[:, 3])),
                                      density=True)

        epmf_reg_1_dev = np.histogram(reg_1_dev[:, 3],
                                      bins=int(np.max(reg_1_dev[:, 3])),
                                      density=True)

        # Calculate symmetric Jensen - Shannon divergence
        d1 = dit.ScalarDistribution(epmf_reg_0_dev[1][:-1], epmf_reg_0_dev[0])
        d2 = dit.ScalarDistribution(epmf_reg_1_dev[1][:-1], epmf_reg_1_dev[0])
        js_temp = jensen_shannon_divergence([d1, d2])
    except Exception:
        js_temp = None

    if verbose:
        print(
            "Empirical Probabilities: \n Empirical Catch Prob.: {} \n Empirical Regime Switch Prob.: {} \n Empirical Overall High-Intensity Stimulus Prob.: {} \n Empirical Regime 0 High-Intensity Stimulus Prob.: {} \n Empirical Regime 1 High-Intensity Stimulus Prob.: {} \n Empirical Regime 0 Alternation Prob.: {} \n Empirical Regime 1 Alternation Prob.: {}  \n JS Div. Deviant Waiting Time Distr. between Regimes: {} \n Time in Regime 0: {} \n Average Train Length in Regime 0: {} \n Average Train Length in Regime 1: {}"
            .format(catch_prob, switch_prob, stim_prob_overall, stim_prob_reg0,
                    stim_prob_reg1, alt_prob_reg0, alt_prob_reg1, js_temp,
                    time_reg0, avg_train_reg0, avg_train_reg1))
        print("--------------------------------------------")

    stats_out = {
        "emp_catch_prob": catch_prob,
        "emp_overall_sp": stim_prob_overall,
        "emp_reg0_sp": stim_prob_reg0,
        "emp_reg1_sp": stim_prob_reg1,
        "emp_reg0_ap": alt_prob_reg0,
        "emp_reg1_ap": alt_prob_reg1,
        "js_div": js_temp,
        "avg_train_r0": avg_train_reg0,
        "avg_train_r1": avg_train_reg1
    }
    return stats_out, reg_0_dev, reg_1_dev
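The try-block's histogram-to-ScalarDistribution step, sketched on invented geometric waiting times; note that density=True returns densities rather than a pmf, so normalized counts are used here to guarantee the probabilities sum to one:

import numpy as np
import dit
from dit.divergences import jensen_shannon_divergence

rng = np.random.default_rng(0)
waits_0 = rng.geometric(p=0.5, size=500)   # invented regime-0 waiting times
waits_1 = rng.geometric(p=0.25, size=500)  # invented regime-1 waiting times

counts_0, edges_0 = np.histogram(waits_0, bins=int(waits_0.max()))
counts_1, edges_1 = np.histogram(waits_1, bins=int(waits_1.max()))

d1 = dit.ScalarDistribution(edges_0[:-1], counts_0 / counts_0.sum())
d2 = dit.ScalarDistribution(edges_1[:-1], counts_1 / counts_1.sum())
print(jensen_shannon_divergence([d1, d2]))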
Example #23
def test_rvfunctions_scalardist():
    d = dit.ScalarDistribution(range(5), [1 / 5] * 5)
    assert_raises(ditException, dit.RVFunctions, d)
Example #24
    rand_seq_1 = create_random_seq(sl)
    rand_seq_2 = create_random_seq(sl)

    vector_1 = vector(rand_seq_1, [k])
    vector_2 = vector(rand_seq_2, [k])

    eds.append(np.linalg.norm(vector_1 - vector_2))
    jsds.append(
        jensen_shannon_divergence([
            dit.ScalarDistribution(vector_1),
            dit.ScalarDistribution(vector_2)
        ]))

plt.figure()
plt.scatter(eds,
            jsds,
            edgecolor='black',
            linewidth=1,
            alpha=0.5,
            facecolor='green')
plt.xlabel('Euclidean Distance')
plt.ylabel('Jensen-Shannon Divergence')
plt.title('JSD vs. ED, k = ' + str(k) + ', seq_len = ' + str(sl))
plt.show()
Example #25
def test_rvfunctions_scalardist():
    d = dit.ScalarDistribution(range(5), [1 / 5] * 5)
    with pytest.raises(ditException):
        dit.RVFunctions(d)
Example #26
# prob1, prob3, ensemble_probs and Mixture_dist come from earlier context not
# shown here; Mixture_dist is presumably the average of the ensemble members.
Kl_loss = nn.KLDivLoss(reduction='sum')  # equivalent to reduce=True, size_average=False
JS_Div_loss = Kl_loss(torch.log(ensemble_probs), Mixture_dist)

## for this, using ln
JS_by_hand = 0.5*(prob1[0][0][0][0]*torch.log(prob1[0][0][0][0]/ensemble_probs[0][0][0][0])
             + prob1[0][1][0][0]*torch.log(prob1[0][1][0][0]/ensemble_probs[0][1][0][0])
             + prob3[0][0][0][0]*torch.log(prob3[0][0][0][0]/ensemble_probs[0][0][0][0])
             + prob3[0][1][0][0]*torch.log(prob3[0][1][0][0]/ensemble_probs[0][1][0][0]))

print('implemented JS by pytorch: ', JS_Div_loss, ' , implemented by hand:',
      JS_by_hand)

# for this, using log2
import dit, numpy as np
from dit.divergences import jensen_shannon_divergence
X = dit.ScalarDistribution(['0', '1'], prob1.numpy().ravel())
Y = dit.ScalarDistribution(['0', '1'], prob3.numpy().ravel())
# dit returns bits (log2); multiply by ln(2) to convert to nats (ln).
print('JS-div in log2:', jensen_shannon_divergence([X, Y]), ' , in ln: ',
      jensen_shannon_divergence([X, Y]) * np.log(2))

## image examples:

prob1 = F.softmax(torch.rand(1, 2, 256, 256), 1)
# prob2 =  F.softmax(torch.rand(1,2,256,256),1)
prob2 = copy.deepcopy(prob1)
prob3 = F.softmax(torch.rand(1, 2, 256, 256), 1)
# prob3 = copy.deepcopy(prob1)

ensemble_probs = torch.cat([prob1, prob2, prob3], 0)
distribution_number = ensemble_probs.shape[0]
Example #27
def js_divergence(logits, ae_logits):
    # dit expects probabilities that sum to one, so despite the names these
    # arguments must already be normalized class probabilities, not raw logits.
    a = dit.ScalarDistribution([0, 1], logits)
    b = dit.ScalarDistribution([0, 1], ae_logits)
    return jensen_shannon_divergence([a, b])
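For example, with two binary class-probability vectors:

print(js_divergence([0.9, 0.1], [0.5, 0.5]))  # ~0.147 bits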
Example #28
def extropy(dist, rvs=None, rv_mode=None):
    """
    Returns the extropy J[X] over the random variables in `rvs`.

    If the distribution represents linear probabilities, then the extropy
    is calculated with units of 'bits' (base-2).

    Parameters
    ----------
    dist : Distribution or float
        The distribution from which the extropy is calculated. If a float,
        then we calculate the binary extropy.
    rvs : list, None
        The indexes of the random variable used to calculate the extropy.
        If None, then the extropy is calculated over all random variables.
        This should remain `None` for ScalarDistributions.
    rv_mode : str, None
        Specifies how to interpret the elements of `rvs`. Valid options are:
        {'indices', 'names'}. If equal to 'indices', then the elements of
        `rvs` are interpreted as random variable indices. If equal to 'names',
        the the elements are interpreted as random variable names. If `None`,
        then the value of `dist._rv_mode` is consulted.

    Returns
    -------
    J : float
        The extropy of the distribution.

    """
    try:
        # Handle binary extropy.
        float(dist)
    except TypeError:
        pass
    else:
        # Assume linear probability for binary extropy.
        import dit
        dist = dit.ScalarDistribution([dist, 1 - dist])
        rvs = None
        rv_mode = RV_MODES.INDICES

    if dist.is_joint():
        if rvs is None:
            # Set to extropy of entire distribution
            rvs = list(range(dist.outcome_length()))
            rv_mode = RV_MODES.INDICES

        d = dist.marginal(rvs, rv_mode=rv_mode)
    else:
        d = dist

    pmf = d.pmf
    if d.is_log():
        base = d.get_base(numerical=True)
        npmf = d.ops.log(1 - d.ops.exp(pmf))
        terms = -base**npmf * npmf
    else:
        # Calculate extropy in bits.
        log = get_ops(2).log
        npmf = 1 - pmf
        terms = -npmf * log(npmf)

    J = np.nansum(terms)
    return J
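A numeric check of the function above (it is dit.other.extropy), assuming the library's internal imports (RV_MODES, get_ops) are in scope as in the dit source: for a fair coin the extropy equals the entropy at one bit, and the float shortcut computes the binary extropy directly:

import dit

d = dit.ScalarDistribution(['H', 'T'], [0.5, 0.5])
print(extropy(d))    # 1.0 bit
print(extropy(0.5))  # 1.0 bit via the binary shortcut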