def propTrain_split(words, labels, tagsInOrder, config):
    """Find a train/dev/test split whose *train* label distribution matches
    the full-data label distribution.

    Randomly samples candidate splits and keeps the one whose train-set
    label distribution has the lowest KL divergence from the distribution
    of the whole dataset.  Only splits whose train set covers all 41
    labels are eligible.  The best split is written out line-based to
    ``config.filenames_split``.

    Parameters
    ----------
    words, labels : indexable sequences of equal length
        The dataset; indexed with the index arrays produced by
        ``random_split``.
    tagsInOrder : list
        Ordered tag names forwarded to ``getLabelDist``.
    config : object
        Experiment configuration (split name, ratios, trial count,
        output filenames).

    Raises
    ------
    RuntimeError
        If no sampled split covered all labels.
    """
    config.set_split('propTrain')
    num_data = len(words)
    data_ydist, _, _ = getLabelDist(labels, tagsInOrder)

    min_loss = float('inf')  # replaces the old magic sentinel 9999999
    best_split = None
    for split in random_split(num_data, config.max_sampling_trials,
                              config.ratio_split):
        train, dev, test = split
        # Label distribution for each split
        train_ydist, train_nlabels, _ = getLabelDist(labels[train], tagsInOrder)
        test_ydist, _, _ = getLabelDist(labels[test], tagsInOrder)
        dev_ydist, _, _ = getLabelDist(labels[dev], tagsInOrder)
        # Only consider splits whose train set contains every label
        # (41 appears to be the full tag-set size -- TODO confirm).
        if train_nlabels == 41:
            kl_loss = kl(train_ydist, data_ydist)
            if kl_loss < min_loss:
                min_loss = kl_loss
                best_split = split
                plotbar(data_ydist, train_ydist, dev_ydist, test_ydist,
                        train_nlabels,
                        save_filename='{}.png'.format(config.split))
                print('loss: {}'.format(kl_loss))

    # Fail loudly instead of the opaque TypeError the old unpack would raise.
    if best_split is None:
        raise RuntimeError('no sampled split covered all 41 labels; '
                           'increase config.max_sampling_trials')
    train, dev, test = best_split
    write_linebased(words[train], labels[train], config.filenames_split[0])
    write_linebased(words[dev], labels[dev], config.filenames_split[1])
    write_linebased(words[test], labels[test], config.filenames_split[2])
def propAll_split(words, labels, tagsInOrder, config):
    """Find a train/dev/test split where dev and test distributions both
    match the train distribution.

    Randomly samples candidate splits; for each eligible split (every
    subset covers all 41 labels) it scores the pair of KL divergences of
    dev and test against train, weighting both their magnitude and their
    imbalance.  The best split is written out line-based to
    ``config.filenames_split``.

    Parameters
    ----------
    words, labels : indexable sequences of equal length
    tagsInOrder : list
        Ordered tag names forwarded to ``getLabelDist``.
    config : object
        Experiment configuration (split name, ratios, trial count,
        output filenames).

    Raises
    ------
    RuntimeError
        If no sampled split covered all labels.
    """
    config.set_split('propAll')
    num_data = len(words)
    data_ydist, _, _ = getLabelDist(labels, tagsInOrder)

    min_loss = float('inf')  # replaces the old magic sentinel 9999999
    best_split = None
    for split in random_split(num_data, config.max_sampling_trials,
                              config.ratio_split):
        train, dev, test = split
        # Label distribution for each split
        train_ydist, train_nlabels, _ = getLabelDist(labels[train], tagsInOrder)
        dev_ydist, dev_nlabels, _ = getLabelDist(labels[dev], tagsInOrder)
        test_ydist, test_nlabels, _ = getLabelDist(labels[test], tagsInOrder)
        # Every subset must contain all 41 labels to be eligible
        # (41 appears to be the full tag-set size -- TODO confirm).
        if np.all(np.array([train_nlabels, test_nlabels, dev_nlabels]) == 41):
            kl1 = kl(dev_ydist, train_ydist)
            kl2 = kl(test_ydist, train_ydist)
            # Penalise both the magnitude and the imbalance of the two KLs.
            kl_loss = 0.3 * (kl1 + kl2) + 0.7 * abs(kl1 - kl2)
            if kl_loss < min_loss:
                min_loss = kl_loss
                best_split = split
                plotbar(data_ydist, train_ydist, dev_ydist, test_ydist, 41,
                        save_filename='{}.png'.format(config.split))
                print("loss: {}".format(kl_loss))
                print("--- kl,(dev, test): {}, {}".format(kl1, kl2))

    # Fail loudly instead of the opaque TypeError the old unpack would raise.
    if best_split is None:
        raise RuntimeError('no sampled split covered all 41 labels; '
                           'increase config.max_sampling_trials')
    train, dev, test = best_split
    write_linebased(words[train], labels[train], config.filenames_split[0])
    write_linebased(words[dev], labels[dev], config.filenames_split[1])
    write_linebased(words[test], labels[test], config.filenames_split[2])
def kl_distance(x, y):
    """Thin wrapper around ``scipy.stats.entropy``.

    Parameters
    ----------
    x : numpy.array
        First distribution.
    y : numpy.array
        Second distribution.

    Returns
    -------
    float
        Kullback-Leibler divergence of ``x`` from ``y``.
    """
    divergence = kl(x, y)
    assert divergence is not None, 'KL divergence is Null'
    return divergence
def get_kl_indexes(self, item):
    """Greedily pick sentence indexes for an extractive summary.

    Repeatedly adds the sentence that minimises the KL divergence between
    the (add-one smoothed) lemma distribution of the candidate summary and
    that of the full text, until the summary reaches ``self.mean_ntokens``
    tokens or adding the next sentence would overshoot that target by more
    than stopping undershoots it.

    Parameters
    ----------
    item : object
        Project item exposing ``item.quote.get_lemmas()`` (one Counter of
        lemmas per sentence) and ``item.quote._nlemmas_by_sentence``
        (token count per sentence) -- assumed schema, confirm at caller.

    Returns
    -------
    list[int]
        Indexes of the selected sentences, in selection order.
    """
    sentences = item.quote.get_lemmas()
    ntokens = item.quote._nlemmas_by_sentence
    full_text = sum(sentences, Counter())
    full_text_nwords = sum(full_text.values())
    # Add-one (Laplace) smoothed unigram distribution over the full text.
    full_text_dist = [
        (full_text[key] + 1) / (full_text_nwords + len(full_text.keys()))
        for key in full_text.keys()
    ]
    summary_idx = []
    summary_ntokens = 0
    summary_counts = Counter()
    while summary_ntokens < self.mean_ntokens:
        # Already-selected sentences keep a score of 1.0 as a sentinel.
        current_scores = np.ones((len(sentences)))
        for idx, lemmas in enumerate(sentences):
            if idx not in summary_idx:
                if summary_ntokens > 0:
                    # Score the candidate summary: current summary + sentence.
                    lemmas = summary_counts + lemmas
                dist = [(lemmas[key] + 1) /
                        (sum(lemmas.values()) + len(full_text.keys()))
                        for key in full_text.keys()]
                current_scores[idx] = kl(full_text_dist, dist)
        idx_max = np.argmin(current_scores)
        idx_ntokens = ntokens[idx_max]
        # Stop early if adding this sentence moves us further from the
        # target length than stopping here would.
        if summary_ntokens > 0 and \
           abs(self.mean_ntokens - summary_ntokens - idx_ntokens) > abs(self.mean_ntokens - summary_ntokens):
            return summary_idx
        summary_idx.append(idx_max)
        summary_counts += sentences[idx_max]
        summary_ntokens += idx_ntokens
    # BUG FIX: was `summary_ntokens is 0` -- identity comparison against an
    # int literal is implementation-defined; use equality.
    if summary_ntokens == 0:
        print(
            f"No summary for {item.uid} | {int(item.quote.ntokens / item.quote.nsents)}\t| {item.quote.cleaned_text}"
        )
    return summary_idx
def kl_divergence(x, y):
    """Average KL divergence between the transposed inputs.

    Transposes both arrays, hands them to ``kl``, and collapses the
    per-slice divergences into a single scalar mean.
    """
    divergences = kl(x.T, y.T)
    return divergences.mean()
def generate_balance_sets(config):
    """Generate train/dev/test SemCor splits with balanced tag distributions.

    Samples 500 random splits (each defined by three RNG seeds), keeps the
    seeds of the split minimising a weighted KL-divergence loss of the
    dev/test tag distributions against train, then rebuilds that split,
    plots its distributions, and writes it to ``config.filenames_split``.

    Parameters
    ----------
    config : object
        Experiment configuration (raw filename, tag vocabulary, split
        ratios, output filenames).

    Raises
    ------
    RuntimeError
        If none of the sampled splits covered all 41 tags.
    """
    if not os.path.exists(config.filename_raw):
        generate_semcor_data(config.filename_raw)
    raw = SemcorDataset(config.filename_raw)

    # Tags whose name starts with 'B' form the label set of interest.
    tagsInOrder = [k for k in config.vocab_tags if k.startswith('B')]

    _words, _labels = [], []
    for word, label, _ in raw:
        _words.append(word)
        _labels.append(label)

    min_loss = float('inf')  # replaces the old magic sentinel 100000
    best_seed = None
    for i in range(500):
        # Generate a candidate split from three fresh random seeds.
        seed_shuffle = random.randint(0, 2**32 - 1)
        words, labels = shuffle(_words, _labels, random_state=seed_shuffle)
        seed1 = random.randint(0, 2**32 - 1)
        X, Xtest, y, ytest = train_test_split(
            words, labels,
            test_size=config.ratio_split[2], random_state=seed1)
        # BUG FIX: the dev fraction of the *remaining* data is
        # dev / (train + dev); the original divided by ratio_split[:1]
        # (the train ratio alone), which oversizes the dev set.
        dev_size = config.ratio_split[1] / np.sum(config.ratio_split[:2])
        seed2 = random.randint(0, 2**32 - 1)
        Xtrain, Xdev, ytrain, ydev = train_test_split(
            X, y, test_size=dev_size, random_state=seed2)

        # Tag distribution (and coverage) of each subset.
        ydist_train, nlabels_train, freq_train = getTagDistFromList(
            ytrain, tagsInOrder)
        ydist_test, nlabels_test, freq_test = getTagDistFromList(
            ytest, tagsInOrder)
        ydist_dev, nlabels_dev, freq_dev = getTagDistFromList(
            ydev, tagsInOrder)

        # Only splits where every subset covers all 41 tags are eligible
        # (41 appears to be the full tag-set size -- TODO confirm).
        if np.all(np.array([nlabels_train, nlabels_test, nlabels_dev]) == 41):
            kl1 = kl(ydist_test, ydist_train)
            kl2 = kl(ydist_dev, ydist_train)
            # Penalise both the magnitude and the imbalance of the two KLs.
            kl_loss = 0.3 * (kl1 + kl2) + 0.7 * abs(kl1 - kl2)
            split_seeds = (seed_shuffle, seed1, seed2)
            if kl_loss < min_loss:
                min_loss = kl_loss
                best_seed = split_seeds
                print("New best kl loss: {}, {}".format(i, kl_loss))
                print("--- seed: {}".format(best_seed))
                # FIX: this format string previously contained a raw
                # newline that broke the source; use an explicit \n.
                print("--- kl,(test, dev): {}, \n{}".format(kl1, kl2))
                plotbar(ydist_train, ydist_dev, ydist_test, 41)

    if best_seed is None:
        raise RuntimeError('no sampled split covered all 41 tags; '
                           'try more trials or different ratios')

    # Rebuild the best split from its recorded seeds.
    words, labels = shuffle(_words, _labels, random_state=best_seed[0])
    X, Xtest, y, ytest = train_test_split(
        words, labels,
        test_size=config.ratio_split[2], random_state=best_seed[1])
    dev_size = config.ratio_split[1] / np.sum(config.ratio_split[:2])
    Xtrain, Xdev, ytrain, ydev = train_test_split(
        X, y, test_size=dev_size, random_state=best_seed[2])

    ydist_train, nlabels_train, label_freq_train = getTagDistFromList(
        ytrain, tagsInOrder)
    ydist_test, _, label_freq_test = getTagDistFromList(ytest, tagsInOrder)
    ydist_dev, _, label_freq_dev = getTagDistFromList(ydev, tagsInOrder)

    # Bar plot of the final distributions.
    plotbar(ydist_train, ydist_dev, ydist_test, 41, save=True)

    write_linebased(Xtrain, ytrain, config.filenames_split[0])
    write_linebased(Xdev, ydev, config.filenames_split[1])
    write_linebased(Xtest, ytest, config.filenames_split[2])