def evaluate_distribution(event_name, words, session, ids):
    """
    Do a complete evaluation of the distribution of words for an event.
    Compute Jensen-Shannon divergence and Jaccard index.
    :param event_name: name of the event to be evaluated
    :param words: number of most common words used in the comparison
    :param session: session passed through to calculate_distribution_event
    :param ids: ids passed through to calculate_distribution_event
    :return:
    """
    print(event_name, words)
    words_event, distribution_event, pairs_event = calculate_distribution_event(event_name, session, ids, True)
    path_references = Path(LOCAL_DATA_DIR_2, 'data', event_name, 'summaries', 'reference')
    references_list = [reference for reference in path_references.iterdir() if reference.is_file()]
    event_dist = dit.ScalarDistribution(words_event, distribution_event)
    words_set_event = set(words_event[:words])
    print('Most Common words in event: {}'.format(words_set_event))
    total_dist, all_words = global_distribution(references_list)
    all_words_set = set(all_words[:words])
    jaccard = len(words_set_event.intersection(all_words_set)) / len(words_set_event.union(all_words_set))
    print('Most Common words in all timelines: {}'.format(all_words_set))
    print('Jaccard Index with all timelines: {}'.format(jaccard))
    print('Jensen-Shannon with all timelines: {}'.format(jensen_shannon_divergence([total_dist, event_dist])))
    for reference in references_list:
        words_timeline, probs_timeline, pairs_timeline = calculate_distribution_timeline(event_name, reference)
        dist_timeline = dit.ScalarDistribution(words_timeline, probs_timeline)
        print('----------------------------')
        word_set_timeline = set(words_timeline[:words])
        print(reference.name)
        print('Most Common words in timeline: {}'.format(word_set_timeline))
        print('Jensen-Shannon: {}'.format(jensen_shannon_divergence([dist_timeline, event_dist])))
        jaccard = len(words_set_event.intersection(word_set_timeline)) / len(words_set_event.union(word_set_timeline))
        print('Jaccard Index: {}'.format(jaccard))
def JSD_pos_dataframe(matrix):
    """
    Computes the Jensen-Shannon divergence from the POS distributions.
    @args:
        - the stat file with the POS distribution
    @output:
        - a dataframe with the distance for every permutation (including fr/fr)
    """
    test = dict()
    langues = matrix.index.tolist()
    pos = matrix.columns.tolist()
    for l1, l2 in itertools.product(sorted(langues), repeat=2):  # AA AB BA BB
        distrib_1 = matrix.loc[l1].tolist()
        distrib_2 = matrix.loc[l2].tolist()
        X = dit.ScalarDistribution(pos, distrib_1)
        Y = dit.ScalarDistribution(pos, distrib_2)
        JS = dit.divergences.jensen_shannon_divergence([X, Y])
        # print("l1 {}\tl2 {}\t Jensen-Shannon Divergence {}\n".format(l1, l2, JS))
        if l1 in test:
            test[l1][l2] = JS
        else:
            test[l1] = dict()
            test[l1][l2] = JS
    df = pandas.DataFrame(test)
    return df
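# Hedged usage sketch for JSD_pos_dataframe above. The toy language/POS matrix
# is made up for illustration; the function's module is assumed to provide the
# itertools, dit and pandas imports it relies on.
import itertools
import dit.divergences
import pandas

matrix = pandas.DataFrame(
    {'NOUN': [0.5, 0.4], 'VERB': [0.3, 0.4], 'ADJ': [0.2, 0.2]},
    index=['en', 'fr'])
print(JSD_pos_dataframe(matrix))  # symmetric, with (approximately) zero on the diagonal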
def fitness(individual, data):
    # `target`, `vector` and `vector_to_dna` come from the enclosing scope.
    individual = vector_to_dna(individual)
    # fitness = np.linalg.norm(target - vector(individual))
    fitness = jensen_shannon_divergence([
        dit.ScalarDistribution(target),
        dit.ScalarDistribution(vector(individual))
    ])
    return fitness
def mutual_info(self, sentence_a, sentence_b):
    """
    Computes a family of information metrics:
        Mutual information
        Joint information
        Conditioned information (loss)
        Conditioned information (noise)
        Self-information
    """
    vocab = self.vocab.copy()
    token_counts_1 = self.__get_cnts(sentence_a, vocab)
    token_counts_2 = self.__get_cnts(sentence_b, vocab)
    self.logging.info('token count processed')
    self.logging.info('vocab #' + str(len(self.vocab.keys())))

    alphabet_source = list(set(token_counts_1.keys()))
    self.logging.info('alphabet_source #' + str(len(alphabet_source)))
    alphabet_target = list(set(token_counts_2.keys()))
    self.logging.info('alphabet_target #' + str(len(alphabet_target)))
    self.logging.info('diff src2tgt #' + str(set(token_counts_1.keys()) - set(token_counts_2.keys())))
    self.logging.info('diff tgt2src #' + str(set(token_counts_2.keys()) - set(token_counts_1.keys())))
    assert len(alphabet_source) == len(alphabet_target)

    # Computing self-information (or entropy)
    scalar_distribution_source = dit.ScalarDistribution(alphabet_source, self.__get_freqs(token_counts_1))
    entropy_source = dit.shannon.entropy(scalar_distribution_source)
    scalar_distribution_target = dit.ScalarDistribution(alphabet_target, self.__get_freqs(token_counts_2))
    entropy_target = dit.shannon.entropy(scalar_distribution_target)

    # Computing joint information
    token_counts = {token: (token_counts_1[token] + token_counts_2[token]) for token in vocab}
    alphabet = list(set(token_counts.keys()))
    self.logging.info('alphabet #' + str(len(alphabet)))
    frequencies = self.__get_freqs(token_counts)  # WARNING: if a document is empty, frequencies might create an issue!
    scalar_distribution = dit.ScalarDistribution(alphabet, frequencies)
    joint_entropy = dit.shannon.entropy(scalar_distribution)

    # Computing mutual information
    mutual_information = entropy_source + entropy_target - joint_entropy
    # Computing noise
    noise = joint_entropy - entropy_target
    # Computing loss
    loss = joint_entropy - entropy_source

    return [entropy_source, entropy_target, joint_entropy, mutual_information, loss, noise]
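# A minimal standalone sketch (not part of the class above) of the identities
# the method relies on, checked with dit's joint-distribution helpers on a toy
# P(X, Y). The outcomes and probabilities are made up for illustration.
import dit

d = dit.Distribution(['00', '01', '10', '11'], [0.4, 0.1, 0.1, 0.4])
H_X = dit.shannon.entropy(d, [0])
H_Y = dit.shannon.entropy(d, [1])
H_XY = dit.shannon.entropy(d)
I_XY = dit.shannon.mutual_information(d, [0], [1])
assert abs(I_XY - (H_X + H_Y - H_XY)) < 1e-9  # I(X;Y) = H(X) + H(Y) - H(X,Y)
print(H_XY - H_X, H_XY - H_Y)  # the differences used as "loss"/"noise" above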
def create_distribution(event, reference_name, summary_name):
    path_reference = Path(LOCAL_DATA_DIR_2, 'data', event, 'summaries', 'reference', reference_name)
    path_summary = Path(LOCAL_DATA_DIR_2, 'data', event, 'summaries', 'system', summary_name)
    with path_reference.open('r') as reference, path_summary.open('r') as summary:
        reference_text = reference.read()
        summary_text = summary.read()
        summary_words, summary_probs, _ = calculate_vocab_distribution(summary_text)
        reference_words, reference_probs, _ = calculate_vocab_distribution(reference_text)
        summary_dist = dit.ScalarDistribution(summary_words, summary_probs)
        reference_dist = dit.ScalarDistribution(reference_words, reference_probs)
    return reference_dist, summary_dist
def fitness(individual, data):
    # `mode`, `k`, `target`, `vector` and `vector_to_dna` come from the enclosing scope.
    individual = vector_to_dna(individual)
    if mode == "JSD":
        return jensen_shannon_divergence([
            dit.ScalarDistribution(target / len(k)),
            dit.ScalarDistribution(vector(individual) / len(k))
        ])
    elif mode == "ED":
        return np.linalg.norm(target - vector(individual))
    else:
        raise Exception("Fitness mode must be JSD or ED")
def calculate_jsd(input_values, input_probabilities):
    """Calculates the Jensen-Shannon divergence against every row of the table."""
    processed_probabilities = scale_table(input_probabilities)
    x = dit.ScalarDistribution(amino_list, input_values, sample_space=amino_list, sort=True)
    jsd_values = []
    for row in processed_probabilities:
        y = dit.ScalarDistribution(amino_list, row, sample_space=amino_list, sort=True)
        jsd_values.append(jensen_shannon_divergence([x, y]))
    return jsd_values
def msi(self, sentence_a, sentence_b):
    '''@danaderp Minimum Shared Information'''
    vocab = self.vocab.copy()
    token_counts_1 = self.__get_cnts(sentence_a, vocab)
    token_counts_2 = self.__get_cnts(sentence_b, vocab)
    self.logging.info('token count processed')

    # Minimum shared tokens
    token_counts = {token: min(token_counts_1[token], token_counts_2[token]) for token in vocab}
    alphabet = list(set(token_counts.keys()))
    frequencies = self.__get_freqs(token_counts)
    self.logging.info('frequencies processed')

    if not frequencies:
        # Empty list: NaN means that source and target do not share information at all.
        entropies = float('nan')
        extropies = float('nan')
        self.logging.info('FREQUENCIES NOT COMPUTED!!!<--------------')
    else:
        scalar_distribution = dit.ScalarDistribution(alphabet, frequencies)
        self.logging.info('scalar_distribution processed')
        entropies = dit.shannon.entropy(scalar_distribution)
        self.logging.info('entropies processed')
        extropies = dit.other.extropy(scalar_distribution)
        self.logging.info('extropies processed')

    return [entropies, extropies]
def DtoSD(dist, extract):
    """
    Convert a Distribution to a ScalarDistribution.

    Parameters
    ----------
    dist : Distribution
        The Distribution to convert to a ScalarDistribution.
    extract : bool
        If `True` and the outcome length is 1, then we extract the sole
        element from each outcome and use that value as the scalar outcome.

    """
    if extract and dist.outcome_length() == 1:
        outcomes = tuple(outcome[0] for outcome in dist.outcomes)
        sample_space = dist.alphabet[0]
    else:
        outcomes = dist.outcomes
        sample_space = None

    # If people really want it, we can use _make_distribution.
    # But we have to decide if we want to set the alphabet to the
    # entire sample or just the sample space represented in outcomes.
    d = dit.ScalarDistribution(outcomes,
                               dist.pmf,
                               sample_space=sample_space,
                               base=dist.get_base(),
                               prng=dist.prng,
                               sort=False,
                               sparse=dist.is_sparse(),
                               validate=False)
    return d
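# Hedged usage sketch for DtoSD above: a joint Distribution whose outcomes have
# length 1 collapses, with extract=True, to a ScalarDistribution over the bare
# symbols. Assumes DtoSD is importable from this module.
import dit

d = dit.Distribution(['H', 'T'], [0.5, 0.5])
sd = DtoSD(d, extract=True)
print(sd.outcomes, sd.pmf)  # ('H', 'T') with probabilities [0.5, 0.5]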
def calculate_Y(X, YgX, model="ising"):
    X.make_dense()
    Y_pmf = (X.pmf[:, np.newaxis] * YgX).sum(axis=0)
    outcomes = [-1, 1] if model == "ising" else [0, 1]
    Yd = {o: p for o, p in zip(outcomes, Y_pmf)}
    Y = dit.ScalarDistribution(Yd)
    return Y
def test_product_nonjoint():
    """
    Test product_distribution() from a ScalarDistribution.
    """
    d = dit.ScalarDistribution([.5, .5])
    with pytest.raises(Exception):
        dit.product_distribution(d)
def test_product_nonjoint():
    """
    Test product_distribution() from a ScalarDistribution.
    """
    d = dit.ScalarDistribution([.5, .5])
    assert_raises(Exception, dit.product_distribution, d)
def get_dist(token_counts):
    '''Takes in a counter object of token occurrences and builds the corresponding ScalarDistribution'''
    alphabet = list(set(token_counts.keys()))
    frequencies = get_freqs(token_counts)
    # for token in token_counts:
    #     frequencies.append(token_counts[token] / num_tokens)
    # logging.info(f'alphabet size {len(alphabet)}, freq size {len(frequencies)} alphabet - {list(token_counts.keys())}')
    return dit.ScalarDistribution(alphabet, frequencies)
def calculate_XY(X, YgX, model="ising"):
    X.make_dense()
    a = X.alphabet
    outcomes = list(itertools.product(a, repeat=X.outcome_length() + 1))
    XY_pmf = X.pmf[:, np.newaxis] * YgX
    XYd = {o: p for o, p in zip(outcomes, XY_pmf)}
    XY = dit.ScalarDistribution(XYd)
    return XY
def get_dist(token_counts):
    '''Takes in a counter object of token occurrences and builds the corresponding ScalarDistribution'''
    num_tokens = sum(token_counts.values())
    outcomes = list(set(token_counts.elements()))
    frequencies = []
    for token in token_counts:
        frequencies.append(token_counts[token] / num_tokens)
    return dit.ScalarDistribution(outcomes, frequencies)
def test_pruned_samplespace_scalar():
    """Prune a sample space from a ScalarDistribution."""
    pmf = [1 / 2, 0, 1 / 2]
    d = dit.ScalarDistribution(pmf)
    d2 = dit.algorithms.pruned_samplespace(d)
    ss2_ = [0, 2]
    ss2 = list(d2.sample_space())
    assert ss2 == ss2_
    assert np.allclose(d2.pmf, [1 / 2, 1 / 2])
def test_expanded_samplespace2():
    """Expand a sample space from a ScalarDistribution."""
    pmf = [1 / 2, 1 / 2]
    ss = [0, 1]
    d = dit.ScalarDistribution(pmf)
    assert list(d.sample_space()) == ss
    ss2 = [0, 1, 2]
    d2 = dit.algorithms.expanded_samplespace(d, ss2)
    assert list(d2.sample_space()) == ss2
def dit_shannon(token_counts):
    num_tokens = 0
    for token in token_counts:
        num_tokens += token_counts[token]
    outcomes = list(set(token_counts.elements()))
    frequencies = []
    for token in token_counts:
        frequencies.append(token_counts[token] / num_tokens)
    d = dit.ScalarDistribution(outcomes, frequencies)
    return dit.shannon.entropy(d)
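# Hedged usage sketch for dit_shannon above with a toy Counter; a corpus with
# two equally frequent tokens has exactly one bit of entropy.
from collections import Counter

print(dit_shannon(Counter(['a', 'b', 'a', 'b'])))  # 1.0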
def test_pruned_samplespace():
    """Prune a sample space from a Distribution."""
    outcomes = ['0', '1', '2']
    pmf = [1 / 2, 0, 1 / 2]
    d = dit.ScalarDistribution(outcomes, pmf)
    d2 = dit.algorithms.pruned_samplespace(d)
    ss2_ = ['0', '2']
    ss2 = list(d2.sample_space())
    assert ss2 == ss2_
    assert np.allclose(d2.pmf, [1 / 2, 1 / 2])
def test_pruned_samplespace2():
    """Prune a sample space while specifying a desired sample space."""
    outcomes = ['0', '1', '2', '3']
    pmf = [1 / 2, 0, 1 / 2, 0]
    ss2_ = ['0', '1', '2']
    d = dit.ScalarDistribution(outcomes, pmf)
    d2 = dit.algorithms.pruned_samplespace(d, sample_space=ss2_)
    # We must make it dense, since the zero element will not appear in pmf.
    d2.make_dense()
    ss2 = list(d2.sample_space())
    assert ss2 == ss2_
    assert np.allclose(d2.pmf, [1 / 2, 0, 1 / 2])
def global_distribution(references_list):
    """
    Calculate the distribution of words, considering the concatenation of all timelines for the event.
    :param references_list: list with the names of the files containing the timelines.
    :return: total_distribution, a ScalarDistribution over all words;
             words, all words sorted by probability
    """
    total_reference = ''
    for reference in references_list:
        with reference.open() as f:
            total_reference = total_reference + f.read()
    words, probs, pairs = calculate_vocab_distribution(total_reference)
    total_distribution = dit.ScalarDistribution(words, probs)
    return total_distribution, words
def calc_stats(sequence, verbose):
    """
    INPUT:
        * sequence - Sequence sampled from seq_gen class
    OUTPUT:
        - Summary statistics:
            * js_temp - Jensen-Shannon divergence between the empirical
              standards-between-deviants distributions of the two regimes
    """
    sequence_sub = sequence[sequence[:, 2] != 0.5, :]
    deviants, regime_switches = find_deviants(sequence)

    # Catch trial/regime switch prob
    catch_prob = len(sequence[sequence[:, 2] == 0.5, 0]) / sequence.shape[0]
    switch_prob = regime_switches / sequence.shape[0]
    stim_prob_overall = len(sequence[sequence[:, 2] == 1, 2]) / (
        len(sequence[sequence[:, 2] == 1, 2]) + len(sequence[sequence[:, 2] == 0, 2]))

    # 0th Order Stimulus probability (empirical)
    stim_prob_reg0 = np.mean(sequence[sequence[:, 1] == 0, 2])
    stim_prob_reg1 = np.mean(sequence[sequence[:, 1] == 1, 2])

    # 1st Order Stimulus prob (empirical)
    alt_prob_reg0 = np.mean(deviants[deviants[:, 0] == 0, 1])
    alt_prob_reg1 = np.mean(deviants[deviants[:, 0] == 1, 1])

    # Empirical pmf of standards between deviants for both regimes
    reg_0_dev = deviants[deviants[:, 0] == 0, :]
    reg_1_dev = deviants[deviants[:, 0] == 1, :]

    # Average train-length per regime:
    avg_train_reg0 = (np.sum(deviants[deviants[:, 0] == 0, 3]) /
                      np.count_nonzero(deviants[deviants[:, 0] == 0, 3]))
    avg_train_reg1 = (np.sum(deviants[deviants[:, 0] == 1, 3]) /
                      np.count_nonzero(deviants[deviants[:, 0] == 1, 3]))

    # Time spent in Regimes
    trials_in_reg0 = deviants[deviants[:, 0] == 0, 0]
    time_reg0 = trials_in_reg0.shape[0] / deviants.shape[0]

    try:
        epmf_reg_0_dev = np.histogram(reg_0_dev[:, 3],
                                      bins=int(np.max(reg_0_dev[:, 3])),
                                      density=True)
        epmf_reg_1_dev = np.histogram(reg_1_dev[:, 3],
                                      bins=int(np.max(reg_1_dev[:, 3])),
                                      density=True)
        # Calculate symmetric Jensen-Shannon divergence
        d1 = dit.ScalarDistribution(epmf_reg_0_dev[1][:-1], epmf_reg_0_dev[0])
        d2 = dit.ScalarDistribution(epmf_reg_1_dev[1][:-1], epmf_reg_1_dev[0])
        js_temp = jensen_shannon_divergence([d1, d2])
    except:
        js_temp = None

    if verbose:
        print("Empirical Probabilities: \n"
              " Empirical Catch Prob.: {} \n"
              " Empirical Regime Switch Prob.: {} \n"
              " Empirical Overall High-Intensity Stimulus Prob.: {} \n"
              " Empirical Regime 0 High-Intensity Stimulus Prob.: {} \n"
              " Empirical Regime 1 High-Intensity Stimulus Prob.: {} \n"
              " Empirical Regime 0 Alternation Prob.: {} \n"
              " Empirical Regime 1 Alternation Prob.: {} \n"
              " JS Div. Deviant Waiting Time Distr. between Regimes: {} \n"
              " Time in Regime 0: {} \n"
              " Average Train Length in Regime 0: {} \n"
              " Average Train Length in Regime 1: {}".format(
                  catch_prob, switch_prob, stim_prob_overall, stim_prob_reg0,
                  stim_prob_reg1, alt_prob_reg0, alt_prob_reg1, js_temp,
                  time_reg0, avg_train_reg0, avg_train_reg1))
        print("--------------------------------------------")

    stats_out = {
        "emp_catch_prob": catch_prob,
        "emp_overall_sp": stim_prob_overall,
        "emp_reg0_sp": stim_prob_reg0,
        "emp_reg1_sp": stim_prob_reg1,
        "emp_reg0_ap": alt_prob_reg0,
        "emp_reg1_ap": alt_prob_reg1,
        "js_div": js_temp,
        "avg_train_r0": avg_train_reg0,
        "avg_train_r1": avg_train_reg1,
    }
    return stats_out, reg_0_dev, reg_1_dev
def test_rvfunctions_scalardist(): d = dit.ScalarDistribution(range(5), [1 / 5] * 5) assert_raises(ditException, dit.RVFunctions, d)
rand_seq_1 = create_random_seq(sl)
rand_seq_2 = create_random_seq(sl)
# kmer_freqs_1 = k_mer_frequencies(rand_seq_1, k, include_missing=True)
# kmer_freqs_2 = k_mer_frequencies(rand_seq_2, k, include_missing=True)
# print(kmer_freqs_1)
# print(kmer_freqs_2)
vector_1 = vector(rand_seq_1, [k])
vector_2 = vector(rand_seq_2, [k])
eds.append(np.linalg.norm(vector_1 - vector_2))
jsds.append(
    jensen_shannon_divergence([
        dit.ScalarDistribution(vector_1),
        dit.ScalarDistribution(vector_2)
    ]))

plt.figure()
plt.scatter(eds, jsds, edgecolor='black', linewidth=1, alpha=0.5, facecolor='green')
plt.xlabel('Euclidean Distance')
plt.ylabel('Jensen-Shannon Divergence')
plt.title('JSD vs. ED, k = ' + str(k) + ', seq_len = ' + str(sl))
plt.show()
def test_rvfunctions_scalardist():
    d = dit.ScalarDistribution(range(5), [1 / 5] * 5)
    with pytest.raises(ditException):
        dit.RVFunctions(d)
# `prob1`, `prob3`, `Mixture_dist` and `ensemble_probs` are assumed to be defined earlier in the script.
Kl_loss = nn.KLDivLoss(reduce=True, size_average=False)
JS_Div_loss = Kl_loss(torch.log(ensemble_probs), Mixture_dist)  # for this, using ln

JS_by_hand = 0.5 * (prob1[0][0][0][0] * torch.log(prob1[0][0][0][0] / ensemble_probs[0][0][0][0])
                    + prob1[0][1][0][0] * torch.log(prob1[0][1][0][0] / ensemble_probs[0][1][0][0])
                    + prob3[0][0][0][0] * torch.log(prob3[0][0][0][0] / ensemble_probs[0][0][0][0])
                    + prob3[0][1][0][0] * torch.log(prob3[0][1][0][0] / ensemble_probs[0][1][0][0]))
print('implemented JS by pytorch: ', JS_Div_loss, ' , implemented by hand:', JS_by_hand)

# for this, using log2
import dit
import numpy as np
from dit.divergences import jensen_shannon_divergence

X = dit.ScalarDistribution(['0', '1'], prob1.numpy().ravel())
Y = dit.ScalarDistribution(['0', '1'], prob3.numpy().ravel())
print('JS-div in log2:', jensen_shannon_divergence([X, Y]),
      ' , in ln: ', jensen_shannon_divergence([X, Y]) / np.log(2) * np.log(np.e))

# image examples:
prob1 = F.softmax(torch.rand(1, 2, 256, 256), 1)
# prob2 = F.softmax(torch.rand(1, 2, 256, 256), 1)
prob2 = copy.deepcopy(prob1)
prob3 = F.softmax(torch.rand(1, 2, 256, 256), 1)
# prob3 = copy.deepcopy(prob1)
ensemble_probs = torch.cat([prob1, prob2, prob3], 0)
distribution_number = ensemble_probs.shape[0]
def js_divergence(logits, ae_logits):
    a = dit.ScalarDistribution([0, 1], logits)
    b = dit.ScalarDistribution([0, 1], ae_logits)
    return jensen_shannon_divergence([a, b])
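# Hedged usage sketch for js_divergence above: despite the parameter names, the
# arguments are treated as two-element probability vectors over the labels 0/1;
# the function's module is assumed to import dit and jensen_shannon_divergence.
print(js_divergence([0.9, 0.1], [0.6, 0.4]))  # a value between 0 and 1 bit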
def extropy(dist, rvs=None, rv_mode=None):
    """
    Returns the extropy J[X] over the random variables in `rvs`.

    If the distribution represents linear probabilities, then the extropy
    is calculated with units of 'bits' (base-2).

    Parameters
    ----------
    dist : Distribution or float
        The distribution from which the extropy is calculated. If a float,
        then we calculate the binary extropy.
    rvs : list, None
        The indexes of the random variable used to calculate the extropy.
        If None, then the extropy is calculated over all random variables.
        This should remain `None` for ScalarDistributions.
    rv_mode : str, None
        Specifies how to interpret the elements of `rvs`. Valid options are:
        {'indices', 'names'}. If equal to 'indices', then the elements of
        `rvs` are interpreted as random variable indices. If equal to
        'names', the elements are interpreted as random variable names.
        If `None`, then the value of `dist._rv_mode` is consulted.

    Returns
    -------
    J : float
        The extropy of the distribution.

    """
    try:
        # Handle binary extropy.
        float(dist)
    except TypeError:
        pass
    else:
        # Assume linear probability for binary extropy.
        import dit
        dist = dit.ScalarDistribution([dist, 1 - dist])
        rvs = None
        rv_mode = RV_MODES.INDICES

    if dist.is_joint():
        if rvs is None:
            # Set to the extropy of the entire distribution.
            rvs = list(range(dist.outcome_length()))
            rv_mode = RV_MODES.INDICES

        d = dist.marginal(rvs, rv_mode=rv_mode)
    else:
        d = dist

    pmf = d.pmf
    if d.is_log():
        base = d.get_base(numerical=True)
        npmf = d.ops.log(1 - d.ops.exp(pmf))
        terms = -base**npmf * npmf
    else:
        # Calculate extropy in bits.
        log = get_ops(2).log
        npmf = 1 - pmf
        terms = -npmf * log(npmf)

    J = np.nansum(terms)
    return J
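# Hedged usage sketch for extropy above: the float shortcut and an explicit
# two-outcome ScalarDistribution are expected to agree on the binary extropy.
import dit

d = dit.ScalarDistribution(['heads', 'tails'], [0.25, 0.75])
print(extropy(d), extropy(0.25))  # both give the binary extropy J(0.25)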